Added support for multiple districts
This commit is contained in:
parent
9f89a78763
commit
7ade635756
21
README.md
21
README.md
|
@ -2,7 +2,7 @@
|
|||
|
||||
![Apache Licence 2.0](https://img.shields.io/pypi/l/twomartens.allrisscraper)
|
||||
![Supports Python 3.7 and 3.8](https://img.shields.io/pypi/pyversions/twomartens.allrisscraper)
|
||||
![version 0.2.2](https://img.shields.io/pypi/v/twomartens.allrisscraper)
|
||||
![version 0.3.0](https://img.shields.io/pypi/v/twomartens.allrisscraper)
|
||||
|
||||
This scraper requires your username and password and performs the following tasks for you:
|
||||
|
||||
|
@ -10,7 +10,8 @@ This scraper requires your username and password and performs the following task
|
|||
- download of all agendas and motions related to upcoming meetings of committees and plenary sessions
|
||||
- Only considers meetings where you already have been invited formally through ALLRIS.
|
||||
|
||||
**IMPORTANT:** As of now only the Hamburg district of Eimsbüttel is supported.
|
||||
**IMPORTANT:** All districts are supported but official committe abbreviations will only work for
|
||||
Eimsbüttel as of now.
|
||||
|
||||
## Requirements
|
||||
|
||||
|
@ -25,6 +26,22 @@ This scraper requires your username and password and performs the following task
|
|||
2. Run ALLRIS scraper a first time ``tm-allrisscraper`` (creates config ini in your current working directory)
|
||||
3. Fill out the config file with your login credentials and an absolute path on your system to store PDFs of files
|
||||
|
||||
## Configuration
|
||||
|
||||
```ini
|
||||
[Default]
|
||||
; possible values for district: Altona, Bergedorf, Eimsbüttel, Hamburg-Nord,
|
||||
; Hamburg-Mitte, Harburg, Wandsbek
|
||||
district = Eimsbüttel
|
||||
; if you are not from Eimsbüttel your domain ending will differ
|
||||
username = max.mustermann@eimsbuettel.de
|
||||
; password is stored in clear text, therefore ini file should have most
|
||||
; restrictive read permissions
|
||||
password = VerySecurePassword
|
||||
; location for storage of PDFs (trailing slash is IMPORTANT)
|
||||
pdflocation = /path/to/storage/of/PDFs/
|
||||
```
|
||||
|
||||
## Usage after initial setup
|
||||
|
||||
Run ALLRIS scraper: ``tm-allrisscraper`` (takes a few seconds to finish)
|
||||
|
|
2
setup.py
2
setup.py
|
@ -30,7 +30,7 @@ setup(
|
|||
author="Jim Martens",
|
||||
author_email="github@2martens.de",
|
||||
url="https://git.2martens.de/2martens/allris-scraper",
|
||||
version="0.2.2",
|
||||
version="0.3.0",
|
||||
namespace_packages=["twomartens"],
|
||||
packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
|
||||
package_dir={'': 'src'},
|
||||
|
|
|
@ -16,11 +16,43 @@
|
|||
|
||||
|
||||
ABBREVIATIONS = {
|
||||
"Altona": {
|
||||
"Haupt": "HA",
|
||||
},
|
||||
"Bergedorf": {
|
||||
"Haupt": "HA",
|
||||
},
|
||||
"Eimsbüttel": {
|
||||
"RaLNS": "RaLoNiS",
|
||||
"HKS": "HaKuSp",
|
||||
"GNUVWD": "GNUVWDi",
|
||||
"AS": "StaPla",
|
||||
"AU": "Uni"
|
||||
"Haupt": "HA",
|
||||
"Kerngebiet": "KGA",
|
||||
"RaLNS": "RaLoNiS",
|
||||
"HKS": "HaKuS",
|
||||
"GNUVWD": "GNUVWDi",
|
||||
"SAIBGGSG": "SR",
|
||||
"AS": "StaPla",
|
||||
"AU": "Uni"
|
||||
},
|
||||
"Hamburg-Mitte": {
|
||||
"Haupt": "HA",
|
||||
"Stadtplanungs": "StaPla"
|
||||
},
|
||||
"Hamburg-Nord": {
|
||||
"Haupt": "HA",
|
||||
},
|
||||
"Harburg": {
|
||||
"Haupt": "HA",
|
||||
},
|
||||
"Wandsbek": {
|
||||
"Haupt": "HA",
|
||||
}
|
||||
}
|
||||
|
||||
BASE_LINKS = {
|
||||
"Altona": "https://sitzungsdienst-altona.hamburg.de/ri",
|
||||
"Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/ri",
|
||||
"Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/ri",
|
||||
"Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/ri",
|
||||
"Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/ri",
|
||||
"Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri",
|
||||
"Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri",
|
||||
}
|
||||
|
|
|
@ -35,9 +35,10 @@ ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
|
|||
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36'
|
||||
_CONFIG_PROPS = {
|
||||
"Default": {
|
||||
"username": "",
|
||||
"password": "",
|
||||
"pdflocation": ""
|
||||
"district": "Eimsbüttel",
|
||||
"username": "max.mustermann@eimsbuettel.de",
|
||||
"password": "SehrSicheresPasswort",
|
||||
"pdflocation": "/Pfad/zum/Ablegen/der/PDFs/"
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -62,9 +63,11 @@ def main() -> None:
|
|||
|
||||
config = configparser.ConfigParser()
|
||||
config.read(config_file)
|
||||
district = config["Default"]["district"]
|
||||
username = config["Default"]["username"]
|
||||
password = config["Default"]["password"]
|
||||
pdf_location = config["Default"]["pdflocation"]
|
||||
base_url = definitions.BASE_LINKS[district]
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
|
@ -74,9 +77,9 @@ def main() -> None:
|
|||
driver.get(ALLRIS_LOGIN)
|
||||
login(driver, username=username, password=password)
|
||||
driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113")
|
||||
driver.get("https://sitzungsdienst-eimsbuettel.hamburg.de/ri/si012.asp")
|
||||
driver.get(f"{base_url}/ri/si012.asp")
|
||||
meetings = get_meetings(driver)
|
||||
download_documents(driver, meetings, pdf_location)
|
||||
download_documents(driver, meetings, pdf_location, base_url, district)
|
||||
driver.close()
|
||||
|
||||
|
||||
|
@ -105,10 +108,11 @@ def get_meetings(driver: webdriver.WebDriver) -> List[meeting.Meeting]:
|
|||
return meetings
|
||||
|
||||
|
||||
def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting], pdf_location: str) -> None:
|
||||
base_link = "https://sitzungsdienst-eimsbuettel.hamburg.de/ri/do027.asp"
|
||||
for meeting in meetings:
|
||||
driver.get(meeting.link)
|
||||
def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting],
|
||||
pdf_location: str, base_url: str, district: str) -> None:
|
||||
base_link = f"{base_url}/do027.asp"
|
||||
for _meeting in meetings:
|
||||
driver.get(_meeting.link)
|
||||
td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']")
|
||||
form_elements = td.find_elements_by_tag_name("form")
|
||||
agenda_item = form_elements[0]
|
||||
|
@ -118,11 +122,15 @@ def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeti
|
|||
invitation_item = form_elements[2]
|
||||
invitation_link = f"{base_link}?DOLFDNR={invitation_item.find_element_by_name('DOLFDNR').get_property('value')}&options=64"
|
||||
driver.get(agenda_link)
|
||||
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Tagesordnung.pdf")
|
||||
save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Tagesordnung.pdf")
|
||||
driver.get(total_link)
|
||||
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Mappe.pdf")
|
||||
save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Mappe.pdf")
|
||||
driver.get(invitation_link)
|
||||
save_pdf(driver.current_url, f"{pdf_location}{meeting.date.isoformat()}_{get_abbreviated_committee_name(meeting.name)}/Einladung.pdf")
|
||||
save_pdf(driver.current_url, f"{get_formatted_filename(pdf_location, _meeting, district)}/Einladung.pdf")
|
||||
|
||||
|
||||
def get_formatted_filename(pdf_location: str, meeting_obj: meeting.Meeting, district: str) -> str:
|
||||
return f"{pdf_location}{meeting_obj.date.isoformat()}_{get_abbreviated_committee_name(meeting_obj.name, district)}"
|
||||
|
||||
|
||||
def save_pdf(url: str, dest: str) -> None:
|
||||
|
@ -138,29 +146,34 @@ def get_day(date_str: str) -> date:
|
|||
return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3]))
|
||||
|
||||
|
||||
def get_abbreviated_committee_name(name: str) -> str:
|
||||
def get_abbreviated_committee_name(name: str, district: str) -> str:
|
||||
start_committee = "Sitzung des Ausschusses"
|
||||
start_regional_committee = "Sitzung des Regionalausschusses"
|
||||
start_plenary = "Sitzung der Bezirksversammlung"
|
||||
start_youth_help_committee = "Sitzung des Jugendhilfeausschusses"
|
||||
start_other_committee = "Sitzung des"
|
||||
end_other_committee = "ausschusses"
|
||||
abbreviated_name = ""
|
||||
if start_plenary in name:
|
||||
if name.startswith(start_plenary):
|
||||
abbreviated_name = "BV"
|
||||
elif start_committee in name:
|
||||
elif name.startswith(start_committee):
|
||||
second_part = name[len(start_committee):]
|
||||
second_split = second_part.split(sep=",")
|
||||
abbreviated_name = get_abbreviation(second_split)
|
||||
if len(abbreviated_name) == 1:
|
||||
abbreviated_name = f"A{abbreviated_name}"
|
||||
elif start_regional_committee in name:
|
||||
elif name.startswith(start_regional_committee):
|
||||
second_part = name[len(start_regional_committee):]
|
||||
second_split = second_part.split(sep="/")
|
||||
abbreviated_name = f"Ra{get_abbreviation(second_split)}"
|
||||
elif start_youth_help_committee in name:
|
||||
elif name.startswith(start_youth_help_committee):
|
||||
abbreviated_name = "JHA"
|
||||
elif name.startswith(start_other_committee) and name.endswith(end_other_committee):
|
||||
core_name = name[len(start_other_committee):-len(end_other_committee)]
|
||||
abbreviated_name = core_name
|
||||
|
||||
if abbreviated_name in definitions.ABBREVIATIONS["Eimsbüttel"]:
|
||||
abbreviated_name = definitions.ABBREVIATIONS["Eimsbüttel"][abbreviated_name]
|
||||
if abbreviated_name in definitions.ABBREVIATIONS[district]:
|
||||
abbreviated_name = definitions.ABBREVIATIONS[district][abbreviated_name]
|
||||
|
||||
return abbreviated_name
|
||||
|
||||
|
|
Loading…
Reference in New Issue