From fe110962636e09d0f010afb46220100929a935e3 Mon Sep 17 00:00:00 2001 From: Jim Martens Date: Thu, 2 Jul 2020 23:26:23 +0200 Subject: [PATCH] Implemented scraping from public calendar --- src/twomartens/allrisscraper/agenda.py | 32 +++- src/twomartens/allrisscraper/config.py | 21 +++ src/twomartens/allrisscraper/custom_json.py | 12 ++ src/twomartens/allrisscraper/definitions.py | 66 +++++-- src/twomartens/allrisscraper/main.py | 55 +----- src/twomartens/allrisscraper/meeting.py | 2 + src/twomartens/allrisscraper/public.py | 184 ++++++++++++++++++++ 7 files changed, 305 insertions(+), 67 deletions(-) create mode 100644 src/twomartens/allrisscraper/config.py create mode 100644 src/twomartens/allrisscraper/custom_json.py create mode 100644 src/twomartens/allrisscraper/public.py diff --git a/src/twomartens/allrisscraper/agenda.py b/src/twomartens/allrisscraper/agenda.py index a303c9d..a1473bd 100644 --- a/src/twomartens/allrisscraper/agenda.py +++ b/src/twomartens/allrisscraper/agenda.py @@ -15,18 +15,36 @@ # limitations under the License. from dataclasses import dataclass -from typing import List, Dict +from typing import List + + +@dataclass +class Consultation: + authoritative: bool + role: str @dataclass class Motion: - id: str - title: str - text: str + name: str + reference: str + type: str + underDirectionOf: str + context: str + petition: str +@dataclass +class AgendaItem: + number: str + order: int + name: str + public: bool + link: str + motion_link: str + motion_reference: str + + @dataclass class Agenda: - noticesOfChair: Dict[str, Motion] - noticesOfAdministration: Dict[str, Motion] - motions: Dict[str, Motion] + agendaItems: List[AgendaItem] diff --git a/src/twomartens/allrisscraper/config.py b/src/twomartens/allrisscraper/config.py new file mode 100644 index 0000000..a4cbb9c --- /dev/null +++ b/src/twomartens/allrisscraper/config.py @@ -0,0 +1,21 @@ +import configparser + +from twomartens.allrisscraper.definitions import CONFIG_PROPS + + +def initialize_config(config_file: str) -> bool: + try: + with open(config_file, "r"): + # if we reach this branch then the file exists and everything is fine + return True + except FileNotFoundError: + with open(config_file, "w") as file: + parser = configparser.ConfigParser() + for section in CONFIG_PROPS: + parser[section] = {} + for option in CONFIG_PROPS[section]: + default = CONFIG_PROPS[section][option] + parser[section][option] = default + + parser.write(file) + return False diff --git a/src/twomartens/allrisscraper/custom_json.py b/src/twomartens/allrisscraper/custom_json.py new file mode 100644 index 0000000..12be13b --- /dev/null +++ b/src/twomartens/allrisscraper/custom_json.py @@ -0,0 +1,12 @@ +import dataclasses +import datetime +import json + + +class EnhancedJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o): + return dataclasses.asdict(o) + if isinstance(o, datetime.date) or isinstance(o, datetime.time): + return o.__str__() + return super().default(o) diff --git a/src/twomartens/allrisscraper/definitions.py b/src/twomartens/allrisscraper/definitions.py index 92a2c45..5e7ca23 100644 --- a/src/twomartens/allrisscraper/definitions.py +++ b/src/twomartens/allrisscraper/definitions.py @@ -13,16 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import calendar ABBREVIATIONS = { - "Altona": { + "Altona": { "Haupt": "HA", }, - "Bergedorf": { + "Bergedorf": { "Haupt": "HA", }, - "Eimsbüttel": { + "Eimsbüttel": { "Haupt": "HA", "Kerngebiet": "KGA", "RaLNS": "RaLoNiS", @@ -33,26 +33,64 @@ ABBREVIATIONS = { "AU": "Uni" }, "Hamburg-Mitte": { - "Haupt": "HA", + "Haupt": "HA", "Stadtplanungs": "StaPla" }, - "Hamburg-Nord": { + "Hamburg-Nord": { "Haupt": "HA", }, - "Harburg": { + "Harburg": { "Haupt": "HA", }, - "Wandsbek": { + "Wandsbek": { "Haupt": "HA", } } BASE_LINKS = { - "Altona": "https://sitzungsdienst-altona.hamburg.de/ri", - "Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/ri", - "Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/ri", + "Altona": "https://sitzungsdienst-altona.hamburg.de/ri", + "Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/ri", + "Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/ri", "Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/ri", - "Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/ri", - "Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri", - "Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri", + "Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/ri", + "Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri", + "Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri", +} + +PUBLIC_BASE_LINKS = { + "Altona": "https://sitzungsdienst-altona.hamburg.de/bi", + "Bergedorf": "https://sitzungsdienst-bergedorf.hamburg.de/bi", + "Eimsbüttel": "https://sitzungsdienst-eimsbuettel.hamburg.de/bi", + "Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/bi", + "Hamburg-Nord": "https://sitzungsdienst-hamburg-nord.hamburg.de/bi", + "Harburg": "https://sitzungsdienst-harburg.hamburg.de/bi", + "Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/bi", +} + +ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel" +ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel" +CONFIG_PROPS = { + "Default": { + "district": "Eimsbüttel", + "username": "max.mustermann@eimsbuettel.de", + "password": "SehrSicheresPasswort", + "pdflocation": "/Pfad/zum/Ablegen/der/PDFs/", + "jsonLocation": "/Pfad/zum/Ablegen/der/jsons/", + "firefoxBinary": "/Pfad/zur/firefox.exe", + } +} + +MONTHS = { + "Januar": 1, + "Februar": 2, + "März": 3, + "April": 4, + "Mai": 5, + "Juni": 6, + "Juli": 7, + "August": 8, + "September": 9, + "Oktober": 10, + "November": 11, + "Dezember": 12, } diff --git a/src/twomartens/allrisscraper/main.py b/src/twomartens/allrisscraper/main.py index 5f0784f..bad8ff1 100644 --- a/src/twomartens/allrisscraper/main.py +++ b/src/twomartens/allrisscraper/main.py @@ -28,40 +28,16 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary from selenium.webdriver.firefox.options import Options from selenium.webdriver.remote.webelement import WebElement -from twomartens.allrisscraper import meeting +from twomartens.allrisscraper import config as config_module from twomartens.allrisscraper import definitions - - -ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel" -ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel" -_CONFIG_PROPS = { - "Default": { - "district": "Eimsbüttel", - "username": "max.mustermann@eimsbuettel.de", - "password": "SehrSicheresPasswort", - "pdflocation": "/Pfad/zum/Ablegen/der/PDFs/", - "firefoxBinary": "/Pfad/zur/firefox.exe", - } -} +from twomartens.allrisscraper import meeting +from twomartens.allrisscraper.definitions import ALLRIS_LOGIN def main() -> None: config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" - try: - with open(config_file, "r"): - # if we reach this branch then the file exists and everything is fine - pass - except FileNotFoundError: - with open(config_file, "w") as file: - parser = configparser.ConfigParser() - for section in _CONFIG_PROPS: - parser[section] = {} - for option in _CONFIG_PROPS[section]: - default = _CONFIG_PROPS[section][option] - parser[section][option] = default - - parser.write(file) - return + if not config_module.initialize_config(config_file): + return config = configparser.ConfigParser() config.read(config_file) @@ -83,7 +59,6 @@ def main() -> None: driver.get("https://serviceportal.hamburg.de/HamburgGateway/Service/StartService/ALLMAnd") driver.get(f"{base_url}/si012.asp") meetings = get_meetings(driver) - fill_agendas_committees(driver, meetings) download_documents(driver, meetings, pdf_location, base_url, district) driver.close() @@ -108,26 +83,14 @@ def get_meetings(driver: webdriver.Firefox) -> List[meeting.Meeting]: agenda_link = tds[4].find_element_by_tag_name("a").get_property("href") name = tds[4].find_element_by_tag_name("a").text location = tds[5].text - meetings.append(meeting.Meeting(name, date_obj, time_obj, agenda_link, location, None)) + meetings.append(meeting.Meeting(name=name, date=date_obj, + time=time_obj, end_time=None, + link=agenda_link, location=location, + agenda=None, address=None)) return meetings -def fill_agendas_committees(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None: - notices_of_chair = "Mitteilungen der/des Vorsitzenden" - notices_of_administration = "Mitteilungen der Verwaltung" - motions = "Anträge / Vorlagen der Verwaltung" - for _meeting in meetings: - driver.get(_meeting.link) - td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_chair + "']") - topChair = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text - td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_administration + "']") - topAdmin = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text - td = driver.find_element(By.XPATH, "//td[text()='" + motions + "']") - topMotions = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text - pass - - def download_documents(driver: webdriver.Firefox, meetings: List[meeting.Meeting], pdf_location: str, base_url: str, district: str) -> None: base_link = f"{base_url}/do027.asp" diff --git a/src/twomartens/allrisscraper/meeting.py b/src/twomartens/allrisscraper/meeting.py index f72100d..23eacea 100644 --- a/src/twomartens/allrisscraper/meeting.py +++ b/src/twomartens/allrisscraper/meeting.py @@ -25,6 +25,8 @@ class Meeting: name: str date: datetime.date time: datetime.time + end_time: Optional[datetime.time] link: str location: str + address: Optional[str] agenda: Optional[Agenda] diff --git a/src/twomartens/allrisscraper/public.py b/src/twomartens/allrisscraper/public.py new file mode 100644 index 0000000..2fe277d --- /dev/null +++ b/src/twomartens/allrisscraper/public.py @@ -0,0 +1,184 @@ +import configparser +import dataclasses +import json +import os +from datetime import date +from datetime import time +from typing import Dict +from typing import List + +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.firefox.webelement import FirefoxWebElement +from selenium.webdriver.remote.webelement import WebElement + +from twomartens.allrisscraper import agenda +from twomartens.allrisscraper import config as config_module +from twomartens.allrisscraper import definitions +from twomartens.allrisscraper import custom_json +from twomartens.allrisscraper import meeting +from twomartens.allrisscraper.definitions import MONTHS +from twomartens.allrisscraper.meeting import Meeting + + +def main(): + config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" + if not config_module.initialize_config(config_file): + return + + config = configparser.ConfigParser() + config.read(config_file) + district = config["Default"]["district"] + json_path = config["Default"]["jsonLocation"] + firefox_binary = config["Default"]["firefoxBinary"] + base_url = definitions.PUBLIC_BASE_LINKS[district] + + options = Options() + options.headless = False + binary = FirefoxBinary(firefox_binary) + driver = webdriver.Firefox(firefox_binary=binary, options=options) + driver.implicitly_wait(2) + driver.get(f"{base_url}/si010_e.asp?MM=6&YY=2020") + meetings = get_meetings(driver) + process_agendas(driver, meetings) + motions = get_motions(driver, meetings) + driver.close() + + os.makedirs(json_path, exist_ok=True) + with open(json_path + "meetings.json", "w") as file: + json.dump(meetings, file, + cls=custom_json.EnhancedJSONEncoder) + with open(json_path + "motions.json", "w") as file: + json.dump(motions, file, + cls=custom_json.EnhancedJSONEncoder) + + +def get_meetings(driver: webdriver): + year_month: str = str(driver.find_element(By.XPATH, "//table[@class='risdeco']//table[1]//tr").text).strip() + month, year = year_month.split(" ") + calendar_lines = driver.find_elements( + By.XPATH, + "//table[@class='tl1']//tr[not(descendant::td[contains(@colspan, '8')])]" + ) + meetings = list() + calendar_lines.remove(calendar_lines[0]) + for line in calendar_lines: + last_date = None + if len(meetings): + last_meeting = meetings[-1] + last_date = last_meeting.date + meetings.append(get_meeting(line, month, year, last_date)) + return meetings + + +def get_meeting(line: FirefoxWebElement, month: str, year: str, last_date: date) -> Meeting: + tds = line.find_elements(By.XPATH, "td") + date_str: str = str(tds[1].text).strip() + if date_str: + date_obj = date(int(year), MONTHS.get(month), int(date_str)) + else: + date_obj = last_date + start_time, end_time = str(tds[2].text).strip().split(" - ") + start_time_obj = time.fromisoformat(start_time) + end_time_obj = time.fromisoformat(end_time) + name = str(tds[5].find_element_by_tag_name("a").text) + agenda_link = str(tds[5].find_element_by_tag_name("a").get_property("href")) + location = str(tds[8].text) + + return meeting.Meeting(name=name, date=date_obj, + time=start_time_obj, end_time=end_time_obj, + link=agenda_link, location=location, + agenda=None, address=None) + + +def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None: + for meeting_obj in meetings: + process_agenda(driver, meeting_obj) + + +def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> None: + driver.get(meeting_obj.link) + td = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]") + tables = td.find_elements(By.XPATH, "table") + meta_table = tables[0] + agenda_table = tables[1] + meta_trs = meta_table.find_elements(By.XPATH, "./tbody//tr//td[1]//tr") + meeting_obj.address = str(meta_trs[5].find_element(By.XPATH, "td[2]").text) + + agenda_item_trs = agenda_table.find_elements( + By.XPATH, + ".//tr[not(descendant::th) and not(descendant::td[contains(@colspan, '7')])]") + agenda_item_trs = agenda_item_trs[:-1] + + agenda_items = list() + for index, agenda_item_tr in enumerate(agenda_item_trs): + agenda_items.append(process_agenda_item(index, agenda_item_tr)) + meeting_obj.agenda = agenda.Agenda(agenda_items) + + +def process_agenda_item(index: int, item: WebElement) -> agenda.AgendaItem: + tds = item.find_elements(By.XPATH, "td") + item_link = str(tds[0].find_element_by_tag_name("a").get_property("href")).strip() + number = str(tds[0].find_element_by_tag_name("a").text).strip() + name = str(tds[3].text).strip() + public = "Ö" in number + motion_td = str(tds[5].text).strip() + has_motion = len(motion_td) != 0 + motion_link = None + motion_reference = None + if has_motion: + motion_link = str(tds[5].find_element_by_tag_name("a").get_property("href")).strip() + motion_reference = str(tds[5].find_element_by_tag_name("a").text).strip() + + return agenda.AgendaItem(number=number, order=index, name=name, + public=public, link=item_link, + motion_link=motion_link, motion_reference=motion_reference) + + +def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, agenda.Motion]: + motions: Dict[str, agenda.Motion] = dict() + for _meeting in meetings: + agenda_items = _meeting.agenda.agendaItems + for agenda_item in agenda_items: + if agenda_item.motion_link is None: + continue + motions[agenda_item.motion_reference] = get_motion(driver, agenda_item.motion_link, + agenda_item.motion_reference) + return motions + + +def get_motion(driver: webdriver.Firefox, link: str, reference: str) -> agenda.Motion: + driver.get(link) + meta_table = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table") + meta_trs = meta_table.find_elements(By.XPATH, "./tbody//tr") + name = str(meta_trs[0].find_element(By.XPATH, "td[2]").text).strip() + motion_type = str(meta_trs[1].find_element(By.XPATH, "td[4]").text).strip() + under_direction_of = str(meta_trs[2].find_element(By.XPATH, "td[2]").text).strip() + + text_divs = driver.find_elements(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//div") + context_div = text_divs[0] + context_ps = context_div.find_elements_by_tag_name("p")[1:-1] + context = "" + for p in context_ps: + if len(context) > 0: + context += "\n" + context += str(p.text).strip() + + petition_div = text_divs[1] + petition_ps = petition_div.find_elements_by_tag_name("p")[1:-1] + petition = "" + for p in petition_ps: + if len(petition) > 0: + petition += "\n" + petition += str(p.text).strip() + petition.rstrip() + + return agenda.Motion(name=name, reference=reference, + type=motion_type, underDirectionOf=under_direction_of, + context=context, petition=petition) + + +if __name__ == "__main__": + main()