diff --git a/README.md b/README.md index c931273..c025588 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ![Apache Licence 2.0](https://img.shields.io/pypi/l/twomartens.allrisscraper) ![Supports Python 3.7 and 3.8](https://img.shields.io/pypi/pyversions/twomartens.allrisscraper) -![version 0.2.2](https://img.shields.io/pypi/v/twomartens.allrisscraper) +![version 0.3.0](https://img.shields.io/pypi/v/twomartens.allrisscraper) This scraper requires your username and password and performs the following tasks for you: @@ -10,7 +10,8 @@ This scraper requires your username and password and performs the following task - download of all agendas and motions related to upcoming meetings of committees and plenary sessions - Only considers meetings where you already have been invited formally through ALLRIS. -**IMPORTANT:** As of now only the Hamburg district of Eimsbüttel is supported. +**IMPORTANT:** All districts are supported, but official committee abbreviations will only work for + Eimsbüttel as of now. ## Requirements @@ -25,6 +26,22 @@ This scraper requires your username and password and performs the following task 2. Run ALLRIS scraper a first time ``tm-allrisscraper`` (creates config ini in your current working directory) 3. 
Fill out the config file with your login credentials and an absolute path on your system to store PDFs of files +## Configuration + +```ini +[Default] +; possible values for district: Altona, Bergedorf, Eimsbüttel, Hamburg-Nord, +; Hamburg-Mitte, Harburg, Wandsbek +district = Eimsbüttel +; if you are not from Eimsbüttel your domain ending will differ +username = max.mustermann@eimsbuettel.de +; password is stored in clear text, therefore ini file should have most +; restrictive read permissions +password = VerySecurePassword +; location for storage of PDFs (trailing slash is IMPORTANT) +pdflocation = /path/to/storage/of/PDFs/ +``` + ## Usage after initial setup Run ALLRIS scraper: ``tm-allrisscraper`` (takes a few seconds to finish) diff --git a/setup.py b/setup.py index 9e1d0ad..fa22acf 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ setup( author="Jim Martens", author_email="github@2martens.de", url="https://git.2martens.de/2martens/allris-scraper", - version="0.2.2", + version="0.3.0", namespace_packages=["twomartens"], packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), package_dir={'': 'src'}, diff --git a/src/twomartens/allrisscraper/definitions.py b/src/twomartens/allrisscraper/definitions.py index 6702f24..92a2c45 100644 --- a/src/twomartens/allrisscraper/definitions.py +++ b/src/twomartens/allrisscraper/definitions.py @@ -16,11 +16,43 @@ ABBREVIATIONS = { + "Altona": { + "Haupt": "HA", + }, + "Bergedorf": { + "Haupt": "HA", + }, "Eimsbüttel": { - "RaLNS": "RaLoNiS", - "HKS": "HaKuSp", - "GNUVWD": "GNUVWDi", - "AS": "StaPla", - "AU": "Uni" + "Haupt": "HA", + "Kerngebiet": "KGA", + "RaLNS": "RaLoNiS", + "HKS": "HaKuS", + "GNUVWD": "GNUVWDi", + "SAIBGGSG": "SR", + "AS": "StaPla", + "AU": "Uni" + }, + "Hamburg-Mitte": { + "Haupt": "HA", + "Stadtplanungs": "StaPla" + }, + "Hamburg-Nord": { + "Haupt": "HA", + }, + "Harburg": { + "Haupt": "HA", + }, + "Wandsbek": { + "Haupt": "HA", + } } + +BASE_LINKS = { + "Altona": 
"https://sitzungsdienst-altona.hamburg.de/ri", + "Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/ri", + "Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/ri", + "Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/ri", + "Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/ri", + "Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri", + "Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri", +} diff --git a/src/twomartens/allrisscraper/main.py b/src/twomartens/allrisscraper/main.py index 1c022cf..963546a 100644 --- a/src/twomartens/allrisscraper/main.py +++ b/src/twomartens/allrisscraper/main.py @@ -35,9 +35,10 @@ ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel" user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36' _CONFIG_PROPS = { "Default": { - "username": "", - "password": "", - "pdflocation": "" + "district": "Eimsbüttel", + "username": "max.mustermann@eimsbuettel.de", + "password": "SehrSicheresPasswort", + "pdflocation": "/Pfad/zum/Ablegen/der/PDFs/" } } @@ -62,9 +63,11 @@ def main() -> None: config = configparser.ConfigParser() config.read(config_file) + district = config["Default"]["district"] username = config["Default"]["username"] password = config["Default"]["password"] pdf_location = config["Default"]["pdflocation"] + base_url = definitions.BASE_LINKS[district] options = Options() options.headless = True @@ -74,9 +77,9 @@ def main() -> None: driver.get(ALLRIS_LOGIN) login(driver, username=username, password=password) driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113") - driver.get("https://sitzungsdienst-eimsbuettel.hamburg.de/ri/si012.asp") + driver.get(f"{base_url}/si012.asp") meetings = get_meetings(driver) - download_documents(driver, meetings, pdf_location) + download_documents(driver, meetings, pdf_location, base_url, district) driver.close() @@ -105,10 +108,11 @@ def 
get_meetings(driver: webdriver.WebDriver) -> List[meeting.Meeting]: return meetings -def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting], pdf_location: str) -> None: - base_link = "https://sitzungsdienst-eimsbuettel.hamburg.de/ri/do027.asp" - for meeting in meetings: - driver.get(meeting.link) +def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting], + pdf_location: str, base_url: str, district: str) -> None: + base_link = f"{base_url}/do027.asp"  # main() passes a definitions.BASE_LINKS entry, which already ends in /ri + for _meeting in meetings:  # underscore keeps the loop variable from shadowing the "meeting" name used in the type hints + driver.get(_meeting.link) td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']") form_elements = td.find_elements_by_tag_name("form") agenda_item = form_elements[0] @@ -118,11 +122,15 @@ def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeti invitation_item = form_elements[2] invitation_link = f"{base_link}?DOLFDNR={invitation_item.find_element_by_name('DOLFDNR').get_property('value')}&options=64" driver.get(agenda_link) - save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Tagesordnung.pdf") + save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Tagesordnung.pdf") driver.get(total_link) - save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Mappe.pdf") + save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Mappe.pdf") driver.get(invitation_link) - save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Einladung.pdf") + save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Einladung.pdf") + + +def get_formatted_filename(pdf_location: str, meeting_obj: meeting.Meeting, district: str) -> str: + return 
f"{pdf_location}{meeting_obj.date.isoformat()}_{get_abbreviated_committee_name(meeting_obj.name, district)}" def save_pdf(url: str, dest: str) -> None: @@ -138,29 +146,34 @@ def get_day(date_str: str) -> date: return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3])) -def get_abbreviated_committee_name(name: str) -> str: +def get_abbreviated_committee_name(name: str, district: str) -> str: start_committee = "Sitzung des Ausschusses" start_regional_committee = "Sitzung des Regionalausschusses" start_plenary = "Sitzung der Bezirksversammlung" start_youth_help_committee = "Sitzung des Jugendhilfeausschusses" + start_other_committee = "Sitzung des" + end_other_committee = "ausschusses" abbreviated_name = "" - if start_plenary in name: + if name.startswith(start_plenary): abbreviated_name = "BV" - elif start_committee in name: + elif name.startswith(start_committee): second_part = name[len(start_committee):] second_split = second_part.split(sep=",") abbreviated_name = get_abbreviation(second_split) if len(abbreviated_name) == 1: abbreviated_name = f"A{abbreviated_name}" - elif start_regional_committee in name: + elif name.startswith(start_regional_committee): second_part = name[len(start_regional_committee):] second_split = second_part.split(sep="/") abbreviated_name = f"Ra{get_abbreviation(second_split)}" - elif start_youth_help_committee in name: + elif name.startswith(start_youth_help_committee): abbreviated_name = "JHA" + elif name.startswith(start_other_committee) and name.endswith(end_other_committee): + core_name = name[len(start_other_committee):-len(end_other_committee)].strip()  # drop the blank left after "Sitzung des" so keys such as "Haupt" match + abbreviated_name = core_name - if abbreviated_name in definitions.ABBREVIATIONS["Eimsbüttel"]: - abbreviated_name = definitions.ABBREVIATIONS["Eimsbüttel"][abbreviated_name] + if abbreviated_name in definitions.ABBREVIATIONS[district]: + abbreviated_name = definitions.ABBREVIATIONS[district][abbreviated_name] return abbreviated_name