Added support for multiple districts

This commit is contained in:
Jim Martens 2020-01-09 23:46:40 +01:00
parent 9f89a78763
commit 7ade635756
4 changed files with 89 additions and 27 deletions

View File

@ -2,7 +2,7 @@
![Apache Licence 2.0](https://img.shields.io/pypi/l/twomartens.allrisscraper) ![Apache Licence 2.0](https://img.shields.io/pypi/l/twomartens.allrisscraper)
![Supports Python 3.7 and 3.8](https://img.shields.io/pypi/pyversions/twomartens.allrisscraper) ![Supports Python 3.7 and 3.8](https://img.shields.io/pypi/pyversions/twomartens.allrisscraper)
![version 0.2.2](https://img.shields.io/pypi/v/twomartens.allrisscraper) ![version 0.3.0](https://img.shields.io/pypi/v/twomartens.allrisscraper)
This scraper requires your username and password and performs the following tasks for you: This scraper requires your username and password and performs the following tasks for you:
@ -10,7 +10,8 @@ This scraper requires your username and password and performs the following task
- download of all agendas and motions related to upcoming meetings of committees and plenary sessions - download of all agendas and motions related to upcoming meetings of committees and plenary sessions
- Only considers meetings where you already have been invited formally through ALLRIS. - Only considers meetings where you already have been invited formally through ALLRIS.
**IMPORTANT:** As of now only the Hamburg district of Eimsbüttel is supported. **IMPORTANT:** All districts are supported, but official committee abbreviations will only work for
Eimsbüttel as of now.
## Requirements ## Requirements
@ -25,6 +26,22 @@ This scraper requires your username and password and performs the following task
2. Run ALLRIS scraper a first time ``tm-allrisscraper`` (creates config ini in your current working directory) 2. Run ALLRIS scraper a first time ``tm-allrisscraper`` (creates config ini in your current working directory)
3. Fill out the config file with your login credentials and an absolute path on your system to store PDFs of files 3. Fill out the config file with your login credentials and an absolute path on your system to store PDFs of files
## Configuration
```ini
[Default]
; possible values for district: Altona, Bergedorf, Eimsbüttel, Hamburg-Nord,
; Hamburg-Mitte, Harburg, Wandsbek
district = Eimsbüttel
; if you are not from Eimsbüttel your domain ending will differ
username = max.mustermann@eimsbuettel.de
; password is stored in clear text, therefore the ini file should have the most
; restrictive read permissions
password = VerySecurePassword
; location for storage of PDFs (trailing slash is IMPORTANT)
pdflocation = /path/to/storage/of/PDFs/
```
## Usage after initial setup ## Usage after initial setup
Run ALLRIS scraper: ``tm-allrisscraper`` (takes a few seconds to finish) Run ALLRIS scraper: ``tm-allrisscraper`` (takes a few seconds to finish)

View File

@ -30,7 +30,7 @@ setup(
author="Jim Martens", author="Jim Martens",
author_email="github@2martens.de", author_email="github@2martens.de",
url="https://git.2martens.de/2martens/allris-scraper", url="https://git.2martens.de/2martens/allris-scraper",
version="0.2.2", version="0.3.0",
namespace_packages=["twomartens"], namespace_packages=["twomartens"],
packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
package_dir={'': 'src'}, package_dir={'': 'src'},

View File

@ -16,11 +16,43 @@
ABBREVIATIONS = { ABBREVIATIONS = {
"Altona": {
"Haupt": "HA",
},
"Bergedorf": {
"Haupt": "HA",
},
"Eimsbüttel": { "Eimsbüttel": {
"RaLNS": "RaLoNiS", "Haupt": "HA",
"HKS": "HaKuSp", "Kerngebiet": "KGA",
"GNUVWD": "GNUVWDi", "RaLNS": "RaLoNiS",
"AS": "StaPla", "HKS": "HaKuS",
"AU": "Uni" "GNUVWD": "GNUVWDi",
"SAIBGGSG": "SR",
"AS": "StaPla",
"AU": "Uni"
},
"Hamburg-Mitte": {
"Haupt": "HA",
"Stadtplanungs": "StaPla"
},
"Hamburg-Nord": {
"Haupt": "HA",
},
"Harburg": {
"Haupt": "HA",
},
"Wandsbek": {
"Haupt": "HA",
} }
} }
# Base URL of the ALLRIS "Sitzungsdienst" site for each supported district,
# keyed by the district name as written in the config file.
# Every value already ends in "/ri" and has no trailing slash, so a page URL
# is formed by appending only "/<page>.asp" (e.g. f"{base_url}/do027.asp");
# appending another "/ri" segment would duplicate that path component.
BASE_LINKS = {
"Altona": "https://sitzungsdienst-altona.hamburg.de/ri",
"Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/ri",
"Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/ri",
"Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/ri",
"Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/ri",
"Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri",
"Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri",
}

View File

@ -35,9 +35,10 @@ ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36' user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36'
_CONFIG_PROPS = { _CONFIG_PROPS = {
"Default": { "Default": {
"username": "", "district": "Eimsbüttel",
"password": "", "username": "max.mustermann@eimsbuettel.de",
"pdflocation": "" "password": "SehrSicheresPasswort",
"pdflocation": "/Pfad/zum/Ablegen/der/PDFs/"
} }
} }
@ -62,9 +63,11 @@ def main() -> None:
config = configparser.ConfigParser() config = configparser.ConfigParser()
config.read(config_file) config.read(config_file)
district = config["Default"]["district"]
username = config["Default"]["username"] username = config["Default"]["username"]
password = config["Default"]["password"] password = config["Default"]["password"]
pdf_location = config["Default"]["pdflocation"] pdf_location = config["Default"]["pdflocation"]
base_url = definitions.BASE_LINKS[district]
options = Options() options = Options()
options.headless = True options.headless = True
@ -74,9 +77,9 @@ def main() -> None:
driver.get(ALLRIS_LOGIN) driver.get(ALLRIS_LOGIN)
login(driver, username=username, password=password) login(driver, username=username, password=password)
driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113") driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113")
driver.get("https://sitzungsdienst-eimsbuettel.hamburg.de/ri/si012.asp") driver.get(f"{base_url}/ri/si012.asp")
meetings = get_meetings(driver) meetings = get_meetings(driver)
download_documents(driver, meetings, pdf_location) download_documents(driver, meetings, pdf_location, base_url, district)
driver.close() driver.close()
@ -105,10 +108,11 @@ def get_meetings(driver: webdriver.WebDriver) -> List[meeting.Meeting]:
return meetings return meetings
def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting], pdf_location: str) -> None: def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting],
base_link = "https://sitzungsdienst-eimsbuettel.hamburg.de/ri/do027.asp" pdf_location: str, base_url: str, district: str) -> None:
for meeting in meetings: base_link = f"{base_url}/do027.asp"
driver.get(meeting.link) for _meeting in meetings:
driver.get(_meeting.link)
td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']") td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']")
form_elements = td.find_elements_by_tag_name("form") form_elements = td.find_elements_by_tag_name("form")
agenda_item = form_elements[0] agenda_item = form_elements[0]
@ -118,11 +122,15 @@ def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeti
invitation_item = form_elements[2] invitation_item = form_elements[2]
invitation_link = f"{base_link}?DOLFDNR={invitation_item.find_element_by_name('DOLFDNR').get_property('value')}&options=64" invitation_link = f"{base_link}?DOLFDNR={invitation_item.find_element_by_name('DOLFDNR').get_property('value')}&options=64"
driver.get(agenda_link) driver.get(agenda_link)
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Tagesordnung.pdf") save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Tagesordnung.pdf")
driver.get(total_link) driver.get(total_link)
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Mappe.pdf") save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Mappe.pdf")
driver.get(invitation_link) driver.get(invitation_link)
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Einladung.pdf") save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Einladung.pdf")
def get_formatted_filename(pdf_location: str, meeting_obj: meeting.Meeting, district: str) -> str:
    """
    Builds the path of the directory in which the PDFs of one meeting are stored.

    The last path component has the form ``<date>_<committee abbreviation>``,
    where ``<date>`` is ``meeting_obj.date.isoformat()`` and the abbreviation
    comes from ``get_abbreviated_committee_name`` for the given district.

    Args:
        pdf_location: base directory for all stored PDFs; works with or
            without a trailing path separator
        meeting_obj: meeting whose ``date`` and ``name`` determine the
            directory name
        district: district name used to look up the committee abbreviation

    Returns:
        path of the meeting directory without a trailing separator
    """
    # Local import keeps this block self-contained regardless of the
    # module's existing import list.
    import os

    folder_name = f"{meeting_obj.date.isoformat()}_{get_abbreviated_committee_name(meeting_obj.name, district)}"
    # os.path.join only inserts a separator when pdf_location does not already
    # end with one, so a configured location with a trailing slash yields the
    # same result as plain concatenation, while a location without it no
    # longer gets fused with the date.
    return os.path.join(pdf_location, folder_name)
def save_pdf(url: str, dest: str) -> None: def save_pdf(url: str, dest: str) -> None:
@ -138,29 +146,34 @@ def get_day(date_str: str) -> date:
return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3])) return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3]))
def get_abbreviated_committee_name(name: str) -> str: def get_abbreviated_committee_name(name: str, district: str) -> str:
start_committee = "Sitzung des Ausschusses" start_committee = "Sitzung des Ausschusses"
start_regional_committee = "Sitzung des Regionalausschusses" start_regional_committee = "Sitzung des Regionalausschusses"
start_plenary = "Sitzung der Bezirksversammlung" start_plenary = "Sitzung der Bezirksversammlung"
start_youth_help_committee = "Sitzung des Jugendhilfeausschusses" start_youth_help_committee = "Sitzung des Jugendhilfeausschusses"
start_other_committee = "Sitzung des"
end_other_committee = "ausschusses"
abbreviated_name = "" abbreviated_name = ""
if start_plenary in name: if name.startswith(start_plenary):
abbreviated_name = "BV" abbreviated_name = "BV"
elif start_committee in name: elif name.startswith(start_committee):
second_part = name[len(start_committee):] second_part = name[len(start_committee):]
second_split = second_part.split(sep=",") second_split = second_part.split(sep=",")
abbreviated_name = get_abbreviation(second_split) abbreviated_name = get_abbreviation(second_split)
if len(abbreviated_name) == 1: if len(abbreviated_name) == 1:
abbreviated_name = f"A{abbreviated_name}" abbreviated_name = f"A{abbreviated_name}"
elif start_regional_committee in name: elif name.startswith(start_regional_committee):
second_part = name[len(start_regional_committee):] second_part = name[len(start_regional_committee):]
second_split = second_part.split(sep="/") second_split = second_part.split(sep="/")
abbreviated_name = f"Ra{get_abbreviation(second_split)}" abbreviated_name = f"Ra{get_abbreviation(second_split)}"
elif start_youth_help_committee in name: elif name.startswith(start_youth_help_committee):
abbreviated_name = "JHA" abbreviated_name = "JHA"
elif name.startswith(start_other_committee) and name.endswith(end_other_committee):
core_name = name[len(start_other_committee):-len(end_other_committee)]
abbreviated_name = core_name
if abbreviated_name in definitions.ABBREVIATIONS["Eimsbüttel"]: if abbreviated_name in definitions.ABBREVIATIONS[district]:
abbreviated_name = definitions.ABBREVIATIONS["Eimsbüttel"][abbreviated_name] abbreviated_name = definitions.ABBREVIATIONS[district][abbreviated_name]
return abbreviated_name return abbreviated_name