Added support for multiple districts
This commit is contained in:
parent
9f89a78763
commit
7ade635756
21
README.md
21
README.md
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
![Apache Licence 2.0](https://img.shields.io/pypi/l/twomartens.allrisscraper)
|
![Apache Licence 2.0](https://img.shields.io/pypi/l/twomartens.allrisscraper)
|
||||||
![Supports Python 3.7 and 3.8](https://img.shields.io/pypi/pyversions/twomartens.allrisscraper)
|
![Supports Python 3.7 and 3.8](https://img.shields.io/pypi/pyversions/twomartens.allrisscraper)
|
||||||
![version 0.2.2](https://img.shields.io/pypi/v/twomartens.allrisscraper)
|
![version 0.3.0](https://img.shields.io/pypi/v/twomartens.allrisscraper)
|
||||||
|
|
||||||
This scraper requires your username and password and performs the following tasks for you:
|
This scraper requires your username and password and performs the following tasks for you:
|
||||||
|
|
||||||
|
@ -10,7 +10,8 @@ This scraper requires your username and password and performs the following task
|
||||||
- download of all agendas and motions related to upcoming meetings of committees and plenary sessions
|
- download of all agendas and motions related to upcoming meetings of committees and plenary sessions
|
||||||
- Only considers meetings where you already have been invited formally through ALLRIS.
|
- Only considers meetings where you already have been invited formally through ALLRIS.
|
||||||
|
|
||||||
**IMPORTANT:** As of now only the Hamburg district of Eimsbüttel is supported.
|
**IMPORTANT:** All districts are supported, but official committee abbreviations will only work for
|
||||||
|
Eimsbüttel as of now.
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
|
@ -25,6 +26,22 @@ This scraper requires your username and password and performs the following task
|
||||||
2. Run ALLRIS scraper a first time ``tm-allrisscraper`` (creates config ini in your current working directory)
|
2. Run ALLRIS scraper a first time ``tm-allrisscraper`` (creates config ini in your current working directory)
|
||||||
3. Fill out the config file with your login credentials and an absolute path on your system to store PDFs of files
|
3. Fill out the config file with your login credentials and an absolute path on your system to store PDFs of files
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Default]
|
||||||
|
; possible values for district: Altona, Bergedorf, Eimsbüttel, Hamburg-Nord,
|
||||||
|
; Hamburg-Mitte, Harburg, Wandsbek
|
||||||
|
district = Eimsbüttel
|
||||||
|
; if you are not from Eimsbüttel your domain ending will differ
|
||||||
|
username = max.mustermann@eimsbuettel.de
|
||||||
|
; password is stored in clear text, therefore the ini file should have the most
|
||||||
|
; restrictive read permissions
|
||||||
|
password = VerySecurePassword
|
||||||
|
; location for storage of PDFs (trailing slash is IMPORTANT)
|
||||||
|
pdflocation = /path/to/storage/of/PDFs/
|
||||||
|
```
|
||||||
|
|
||||||
## Usage after initial setup
|
## Usage after initial setup
|
||||||
|
|
||||||
Run ALLRIS scraper: ``tm-allrisscraper`` (takes a few seconds to finish)
|
Run ALLRIS scraper: ``tm-allrisscraper`` (takes a few seconds to finish)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -30,7 +30,7 @@ setup(
|
||||||
author="Jim Martens",
|
author="Jim Martens",
|
||||||
author_email="github@2martens.de",
|
author_email="github@2martens.de",
|
||||||
url="https://git.2martens.de/2martens/allris-scraper",
|
url="https://git.2martens.de/2martens/allris-scraper",
|
||||||
version="0.2.2",
|
version="0.3.0",
|
||||||
namespace_packages=["twomartens"],
|
namespace_packages=["twomartens"],
|
||||||
packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
||||||
package_dir={'': 'src'},
|
package_dir={'': 'src'},
|
||||||
|
|
|
@ -16,11 +16,43 @@
|
||||||
|
|
||||||
|
|
||||||
ABBREVIATIONS = {
|
ABBREVIATIONS = {
|
||||||
|
"Altona": {
|
||||||
|
"Haupt": "HA",
|
||||||
|
},
|
||||||
|
"Bergedorf": {
|
||||||
|
"Haupt": "HA",
|
||||||
|
},
|
||||||
"Eimsbüttel": {
|
"Eimsbüttel": {
|
||||||
"RaLNS": "RaLoNiS",
|
"Haupt": "HA",
|
||||||
"HKS": "HaKuSp",
|
"Kerngebiet": "KGA",
|
||||||
"GNUVWD": "GNUVWDi",
|
"RaLNS": "RaLoNiS",
|
||||||
"AS": "StaPla",
|
"HKS": "HaKuS",
|
||||||
"AU": "Uni"
|
"GNUVWD": "GNUVWDi",
|
||||||
|
"SAIBGGSG": "SR",
|
||||||
|
"AS": "StaPla",
|
||||||
|
"AU": "Uni"
|
||||||
|
},
|
||||||
|
"Hamburg-Mitte": {
|
||||||
|
"Haupt": "HA",
|
||||||
|
"Stadtplanungs": "StaPla"
|
||||||
|
},
|
||||||
|
"Hamburg-Nord": {
|
||||||
|
"Haupt": "HA",
|
||||||
|
},
|
||||||
|
"Harburg": {
|
||||||
|
"Haupt": "HA",
|
||||||
|
},
|
||||||
|
"Wandsbek": {
|
||||||
|
"Haupt": "HA",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BASE_LINKS = {
|
||||||
|
"Altona": "https://sitzungsdienst-altona.hamburg.de/ri",
|
||||||
|
"Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/ri",
|
||||||
|
"Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/ri",
|
||||||
|
"Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/ri",
|
||||||
|
"Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/ri",
|
||||||
|
"Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri",
|
||||||
|
"Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri",
|
||||||
|
}
|
||||||
|
|
|
@ -35,9 +35,10 @@ ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
|
||||||
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36'
|
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36'
|
||||||
_CONFIG_PROPS = {
|
_CONFIG_PROPS = {
|
||||||
"Default": {
|
"Default": {
|
||||||
"username": "",
|
"district": "Eimsbüttel",
|
||||||
"password": "",
|
"username": "max.mustermann@eimsbuettel.de",
|
||||||
"pdflocation": ""
|
"password": "SehrSicheresPasswort",
|
||||||
|
"pdflocation": "/Pfad/zum/Ablegen/der/PDFs/"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,9 +63,11 @@ def main() -> None:
|
||||||
|
|
||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
config.read(config_file)
|
config.read(config_file)
|
||||||
|
district = config["Default"]["district"]
|
||||||
username = config["Default"]["username"]
|
username = config["Default"]["username"]
|
||||||
password = config["Default"]["password"]
|
password = config["Default"]["password"]
|
||||||
pdf_location = config["Default"]["pdflocation"]
|
pdf_location = config["Default"]["pdflocation"]
|
||||||
|
base_url = definitions.BASE_LINKS[district]
|
||||||
|
|
||||||
options = Options()
|
options = Options()
|
||||||
options.headless = True
|
options.headless = True
|
||||||
|
@ -74,9 +77,9 @@ def main() -> None:
|
||||||
driver.get(ALLRIS_LOGIN)
|
driver.get(ALLRIS_LOGIN)
|
||||||
login(driver, username=username, password=password)
|
login(driver, username=username, password=password)
|
||||||
driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113")
|
driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113")
|
||||||
driver.get("https://sitzungsdienst-eimsbuettel.hamburg.de/ri/si012.asp")
|
# base_url (a BASE_LINKS entry) already ends in "/ri"; appending "/ri" again
# would request ".../ri/ri/si012.asp" instead of the meeting overview page.
driver.get(f"{base_url}/si012.asp")
|
||||||
meetings = get_meetings(driver)
|
meetings = get_meetings(driver)
|
||||||
download_documents(driver, meetings, pdf_location)
|
download_documents(driver, meetings, pdf_location, base_url, district)
|
||||||
driver.close()
|
driver.close()
|
||||||
|
|
||||||
|
|
||||||
|
@ -105,10 +108,11 @@ def get_meetings(driver: webdriver.WebDriver) -> List[meeting.Meeting]:
|
||||||
return meetings
|
return meetings
|
||||||
|
|
||||||
|
|
||||||
def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting], pdf_location: str) -> None:
|
def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting],
|
||||||
base_link = "https://sitzungsdienst-eimsbuettel.hamburg.de/ri/do027.asp"
|
pdf_location: str, base_url: str, district: str) -> None:
|
||||||
for meeting in meetings:
|
base_link = f"{base_url}/do027.asp"
|
||||||
driver.get(meeting.link)
|
for _meeting in meetings:
|
||||||
|
driver.get(_meeting.link)
|
||||||
td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']")
|
td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']")
|
||||||
form_elements = td.find_elements_by_tag_name("form")
|
form_elements = td.find_elements_by_tag_name("form")
|
||||||
agenda_item = form_elements[0]
|
agenda_item = form_elements[0]
|
||||||
|
@ -118,11 +122,15 @@ def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeti
|
||||||
invitation_item = form_elements[2]
|
invitation_item = form_elements[2]
|
||||||
invitation_link = f"{base_link}?DOLFDNR={invitation_item.find_element_by_name('DOLFDNR').get_property('value')}&options=64"
|
invitation_link = f"{base_link}?DOLFDNR={invitation_item.find_element_by_name('DOLFDNR').get_property('value')}&options=64"
|
||||||
driver.get(agenda_link)
|
driver.get(agenda_link)
|
||||||
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Tagesordnung.pdf")
|
save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Tagesordnung.pdf")
|
||||||
driver.get(total_link)
|
driver.get(total_link)
|
||||||
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Mappe.pdf")
|
save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Mappe.pdf")
|
||||||
driver.get(invitation_link)
|
driver.get(invitation_link)
|
||||||
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Einladung.pdf")
|
save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Einladung.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
def get_formatted_filename(pdf_location: str, meeting_obj: meeting.Meeting, district: str) -> str:
|
||||||
|
return f"{pdf_location}{meeting_obj.date.isoformat()}_{get_abbreviated_committee_name(meeting_obj.name, district)}"
|
||||||
|
|
||||||
|
|
||||||
def save_pdf(url: str, dest: str) -> None:
|
def save_pdf(url: str, dest: str) -> None:
|
||||||
|
@ -138,29 +146,34 @@ def get_day(date_str: str) -> date:
|
||||||
return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3]))
|
return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3]))
|
||||||
|
|
||||||
|
|
||||||
def get_abbreviated_committee_name(name: str) -> str:
|
def get_abbreviated_committee_name(name: str, district: str) -> str:
|
||||||
start_committee = "Sitzung des Ausschusses"
|
start_committee = "Sitzung des Ausschusses"
|
||||||
start_regional_committee = "Sitzung des Regionalausschusses"
|
start_regional_committee = "Sitzung des Regionalausschusses"
|
||||||
start_plenary = "Sitzung der Bezirksversammlung"
|
start_plenary = "Sitzung der Bezirksversammlung"
|
||||||
start_youth_help_committee = "Sitzung des Jugendhilfeausschusses"
|
start_youth_help_committee = "Sitzung des Jugendhilfeausschusses"
|
||||||
|
start_other_committee = "Sitzung des"
|
||||||
|
end_other_committee = "ausschusses"
|
||||||
abbreviated_name = ""
|
abbreviated_name = ""
|
||||||
if start_plenary in name:
|
if name.startswith(start_plenary):
|
||||||
abbreviated_name = "BV"
|
abbreviated_name = "BV"
|
||||||
elif start_committee in name:
|
elif name.startswith(start_committee):
|
||||||
second_part = name[len(start_committee):]
|
second_part = name[len(start_committee):]
|
||||||
second_split = second_part.split(sep=",")
|
second_split = second_part.split(sep=",")
|
||||||
abbreviated_name = get_abbreviation(second_split)
|
abbreviated_name = get_abbreviation(second_split)
|
||||||
if len(abbreviated_name) == 1:
|
if len(abbreviated_name) == 1:
|
||||||
abbreviated_name = f"A{abbreviated_name}"
|
abbreviated_name = f"A{abbreviated_name}"
|
||||||
elif start_regional_committee in name:
|
elif name.startswith(start_regional_committee):
|
||||||
second_part = name[len(start_regional_committee):]
|
second_part = name[len(start_regional_committee):]
|
||||||
second_split = second_part.split(sep="/")
|
second_split = second_part.split(sep="/")
|
||||||
abbreviated_name = f"Ra{get_abbreviation(second_split)}"
|
abbreviated_name = f"Ra{get_abbreviation(second_split)}"
|
||||||
elif start_youth_help_committee in name:
|
elif name.startswith(start_youth_help_committee):
|
||||||
abbreviated_name = "JHA"
|
abbreviated_name = "JHA"
|
||||||
|
elif name.startswith(start_other_committee) and name.endswith(end_other_committee):
|
||||||
|
core_name = name[len(start_other_committee):-len(end_other_committee)]
|
||||||
|
abbreviated_name = core_name
|
||||||
|
|
||||||
if abbreviated_name in definitions.ABBREVIATIONS["Eimsbüttel"]:
|
if abbreviated_name in definitions.ABBREVIATIONS[district]:
|
||||||
abbreviated_name = definitions.ABBREVIATIONS["Eimsbüttel"][abbreviated_name]
|
abbreviated_name = definitions.ABBREVIATIONS[district][abbreviated_name]
|
||||||
|
|
||||||
return abbreviated_name
|
return abbreviated_name
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue