Implemented scraping from public calendar

2020-07-02 23:26:23 +02:00
parent 8f0c9f37e2
commit fe11096263
7 changed files with 305 additions and 67 deletions
--- a/src/twomartens/allrisscraper/agenda.py
+++ b/src/twomartens/allrisscraper/agenda.py
@ -15,18 +15,36 @@
 #   limitations under the License.
 from dataclasses import dataclass
-from typing import List, Dict
+from typing import List
# Plain record for a consultation entry; no code in the visible hunks constructs or reads it.
@dataclass
 class Consultation:
    # Flag marking the consultation as authoritative (exact ALLRIS meaning is not visible here).
    authoritative: bool
    # Role string — presumably a display label; TODO confirm against the scraper that fills it.
    role: str
@dataclass
 class Motion:
-    id: str
+    name: str
-    title: str
+    reference: str
-    text: str
+    type: str
    underDirectionOf: str
    context: str
    petition: str
# One row of a meeting agenda, as built by process_agenda_item() in public.py.
@dataclass
 class AgendaItem:
    # Stripped text of the link in the row's first cell.
    number: str
    # Zero-based position of the row among the scraped agenda rows.
    order: int
    # Stripped text of the row's fourth cell.
    name: str
    # True when "Ö" occurs in `number`.
    public: bool
    # href of the link in the row's first cell.
    link: str
    # href of the motion link; the scraper passes None when the row has no motion.
    motion_link: str
    # Link text of the motion cell; the scraper passes None when the row has no motion.
    motion_reference: str
@dataclass
 class Agenda:
-    noticesOfChair: Dict[str, Motion]
+    agendaItems: List[AgendaItem]
    noticesOfAdministration: Dict[str, Motion]
    motions: Dict[str, Motion]
--- a/src/twomartens/allrisscraper/config.py
+++ b/src/twomartens/allrisscraper/config.py
@ -0,0 +1,21 @@
 import configparser
 from twomartens.allrisscraper.definitions import CONFIG_PROPS
 def initialize_config(config_file: str) -> bool:
    """Make sure *config_file* exists, writing a template when it does not.

    :param config_file: path of the INI file to check or create
    :return: True if the file could already be opened for reading,
        False if it was missing and has just been created from CONFIG_PROPS
    """
    try:
        # Opening for reading doubles as the existence check.
        open(config_file, "r").close()
    except FileNotFoundError:
        pass
    else:
        return True

    with open(config_file, "w") as handle:
        template = configparser.ConfigParser()
        for section, options in CONFIG_PROPS.items():
            template[section] = {}
            for option, default in options.items():
                template[section][option] = default
        template.write(handle)
    return False
--- a/src/twomartens/allrisscraper/custom_json.py
+++ b/src/twomartens/allrisscraper/custom_json.py
@ -0,0 +1,12 @@
 import dataclasses
 import datetime
 import json
 class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes dataclass instances, dates and times."""

    def default(self, o):
        """Convert objects the stock encoder rejects into JSON-compatible values.

        :param o: object the base encoder could not serialize
        :return: a dict for dataclass instances, ``str(o)`` for
            ``datetime.date`` / ``datetime.time`` objects
        :raises TypeError: for every other type (raised by the base class)
        """
        # is_dataclass() is also true for dataclass *types*, but asdict() only
        # accepts instances; let types fall through to the regular TypeError.
        if dataclasses.is_dataclass(o) and not isinstance(o, type):
            return dataclasses.asdict(o)
        if isinstance(o, (datetime.date, datetime.time)):
            return str(o)
        return super().default(o)
--- a/src/twomartens/allrisscraper/definitions.py
+++ b/src/twomartens/allrisscraper/definitions.py
@ -13,7 +13,7 @@
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
-
+import calendar
 ABBREVIATIONS = {
    "Altona":        {
@ -56,3 +56,41 @@ BASE_LINKS = {
    "Harburg":       "https://sitzungsdienst-harburg.hamburg.de/ri",
    "Wandsbek":      "https://sitzungsdienst-wandsbek.hamburg.de/ri",
 }
 # District name -> base URL of that district's "/bi" site.
 # public.py looks the URL up with the `district` value from the config file.
 PUBLIC_BASE_LINKS = {
    "Altona":        "https://sitzungsdienst-altona.hamburg.de/bi",
    "Bergedorf":     "https://sitzungsdienst-bergedorf.hamburg.de/bi",
    "Eimsbüttel":    "https://sitzungsdienst-eimsbuettel.hamburg.de/bi",
    "Hamburg-Mitte": "https://sitzungsdienst-hamburg-mitte.hamburg.de/bi",
    "Hamburg-Nord":  "https://sitzungsdienst-hamburg-nord.hamburg.de/bi",
    "Harburg":       "https://sitzungsdienst-harburg.hamburg.de/bi",
    "Wandsbek":      "https://sitzungsdienst-wandsbek.hamburg.de/bi",
 }
 ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
 ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
 # Template for a new config file: section -> option -> placeholder value.
 # config.initialize_config() writes these entries when the file is missing.
 CONFIG_PROPS = {
    "Default": {
        "district":      "Eimsbüttel",
        "username":      "max.mustermann@eimsbuettel.de",
        "password":      "SehrSicheresPasswort",
        "pdflocation":   "/Pfad/zum/Ablegen/der/PDFs/",
        "jsonLocation":  "/Pfad/zum/Ablegen/der/jsons/",
        "firefoxBinary": "/Pfad/zur/firefox.exe",
    }
 }
 # German month name -> month number (1-12); used to build dates from the
 # calendar heading scraped in public.py.
 MONTHS = {
    "Januar": 1,
    "Februar": 2,
    "März": 3,
    "April": 4,
    "Mai": 5,
    "Juni": 6,
    "Juli": 7,
    "August": 8,
    "September": 9,
    "Oktober": 10,
    "November": 11,
    "Dezember": 12,
 }
--- a/src/twomartens/allrisscraper/main.py
+++ b/src/twomartens/allrisscraper/main.py
@ -28,39 +28,15 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.remote.webelement import WebElement
-from twomartens.allrisscraper import meeting
+from twomartens.allrisscraper import config as config_module
 from twomartens.allrisscraper import definitions
-
+from twomartens.allrisscraper import meeting
-
+from twomartens.allrisscraper.definitions import ALLRIS_LOGIN
 ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
 ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
 _CONFIG_PROPS = {
    "Default": {
        "district": "Eimsbüttel",
        "username": "max.mustermann@eimsbuettel.de",
        "password": "SehrSicheresPasswort",
        "pdflocation": "/Pfad/zum/Ablegen/der/PDFs/",
        "firefoxBinary": "/Pfad/zur/firefox.exe",
    }
 }
 def main() -> None:
    config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
-    try:
+    if not config_module.initialize_config(config_file):
        with open(config_file, "r"):
            # if we reach this branch then the file exists and everything is fine
            pass
    except FileNotFoundError:
        with open(config_file, "w") as file:
            parser = configparser.ConfigParser()
            for section in _CONFIG_PROPS:
                parser[section] = {}
                for option in _CONFIG_PROPS[section]:
                    default = _CONFIG_PROPS[section][option]
                    parser[section][option] = default
            parser.write(file)
        return
    config = configparser.ConfigParser()
@ -83,7 +59,6 @@ def main() -> None:
    driver.get("https://serviceportal.hamburg.de/HamburgGateway/Service/StartService/ALLMAnd")
    driver.get(f"{base_url}/si012.asp")
    meetings = get_meetings(driver)
    fill_agendas_committees(driver, meetings)
    download_documents(driver, meetings, pdf_location, base_url, district)
    driver.close()
@ -108,26 +83,14 @@ def get_meetings(driver: webdriver.Firefox) -> List[meeting.Meeting]:
        agenda_link = tds[4].find_element_by_tag_name("a").get_property("href")
        name = tds[4].find_element_by_tag_name("a").text
        location = tds[5].text
-        meetings.append(meeting.Meeting(name, date_obj, time_obj, agenda_link, location, None))
+        meetings.append(meeting.Meeting(name=name, date=date_obj,
                                        time=time_obj, end_time=None,
                                        link=agenda_link, location=location,
                                        agenda=None, address=None))
    return meetings
 def fill_agendas_committees(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
    """Open every meeting's agenda page and look up three fixed agenda headings.

    For each heading the link text in the first cell of the same table row is
    read. The values are only stored in locals and then discarded, so the
    function currently has no effect besides navigating the driver.

    :param driver: Firefox driver used to load the agenda pages
    :param meetings: meetings whose ``link`` is opened one after another
    """
    # Heading texts as they appear in the agenda table cells.
    notices_of_chair = "Mitteilungen der/des Vorsitzenden"
    notices_of_administration = "Mitteilungen der Verwaltung"
    motions = "Anträge / Vorlagen der Verwaltung"
    for _meeting in meetings:
        driver.get(_meeting.link)
        # Find the cell with the heading, step up to its row ('..') and read
        # the link text of the row's first cell.
        td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_chair + "']")
        topChair = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
        td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_administration + "']")
        topAdmin = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
        td = driver.find_element(By.XPATH, "//td[text()='" + motions + "']")
        topMotions = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
        # NOTE(review): topChair/topAdmin/topMotions are never used afterwards.
        pass
 def download_documents(driver: webdriver.Firefox, meetings: List[meeting.Meeting],
                       pdf_location: str, base_url: str, district: str) -> None:
    base_link = f"{base_url}/do027.asp"
--- a/src/twomartens/allrisscraper/meeting.py
+++ b/src/twomartens/allrisscraper/meeting.py
@ -25,6 +25,8 @@ class Meeting:
    name: str
    date: datetime.date
    time: datetime.time
    end_time: Optional[datetime.time]
    link: str
    location: str
    address: Optional[str]
    agenda: Optional[Agenda]
--- a/src/twomartens/allrisscraper/public.py
+++ b/src/twomartens/allrisscraper/public.py
@ -0,0 +1,184 @@
 import configparser
 import dataclasses
 import json
 import os
 from datetime import date
 from datetime import time
 from typing import Dict
 from typing import List
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.webelement import FirefoxWebElement
 from selenium.webdriver.remote.webelement import WebElement
 from twomartens.allrisscraper import agenda
 from twomartens.allrisscraper import config as config_module
 from twomartens.allrisscraper import definitions
 from twomartens.allrisscraper import custom_json
 from twomartens.allrisscraper import meeting
 from twomartens.allrisscraper.definitions import MONTHS
 from twomartens.allrisscraper.meeting import Meeting
 def main():
    """Scrape the public calendar of the configured district and dump it as JSON.

    Reads ``tm-allris-scraper-config.ini`` from the current working directory.
    When the file does not exist yet, a template is written and the function
    returns without scraping. Otherwise the meetings of the calendar page,
    their agendas and the referenced motions are collected and written to
    ``meetings.json`` and ``motions.json`` inside the configured
    ``jsonLocation`` directory.
    """
    config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
    if not config_module.initialize_config(config_file):
        # A template config was just created; it has to be filled in first.
        return

    config = configparser.ConfigParser()
    config.read(config_file)
    district = config["Default"]["district"]
    json_path = config["Default"]["jsonLocation"]
    firefox_binary = config["Default"]["firefoxBinary"]
    base_url = definitions.PUBLIC_BASE_LINKS[district]

    options = Options()
    options.headless = False
    binary = FirefoxBinary(firefox_binary)
    driver = webdriver.Firefox(firefox_binary=binary, options=options)
    try:
        driver.implicitly_wait(2)
        # NOTE(review): the calendar month is hard-coded to June 2020.
        driver.get(f"{base_url}/si010_e.asp?MM=6&YY=2020")
        meetings = get_meetings(driver)
        process_agendas(driver, meetings)
        motions = get_motions(driver, meetings)
    finally:
        # Always shut the browser down, even when scraping fails half-way.
        driver.quit()

    os.makedirs(json_path, exist_ok=True)
    # os.path.join keeps the output inside json_path even when the configured
    # value has no trailing path separator.
    with open(os.path.join(json_path, "meetings.json"), "w") as file:
        json.dump(meetings, file,
                  cls=custom_json.EnhancedJSONEncoder)
    with open(os.path.join(json_path, "motions.json"), "w") as file:
        json.dump(motions, file,
                  cls=custom_json.EnhancedJSONEncoder)
 def get_meetings(driver: webdriver.Firefox) -> List[Meeting]:
    """Read all meetings from the calendar page currently loaded in *driver*.

    The page heading is expected to read ``"<month name> <year>"``. Every
    calendar row except the first matched one is converted with
    get_meeting(); rows without their own day number reuse the date of the
    preceding meeting.

    :param driver: driver that already shows the monthly calendar page
    :return: meetings in the order of the calendar rows (empty if none found)
    :raises ValueError: if the heading does not split into exactly two parts
    """
    heading = driver.find_element(By.XPATH, "//table[@class='risdeco']//table[1]//tr")
    year_month: str = str(heading.text).strip()
    month, year = year_month.split(" ")
    calendar_lines = driver.find_elements(
            By.XPATH,
            "//table[@class='tl1']//tr[not(descendant::td[contains(@colspan, '8')])]"
    )
    meetings: List[Meeting] = []
    last_date = None
    # The first matched row is skipped (presumably the table header — confirm);
    # slicing instead of remove() also copes with an empty result list.
    for line in calendar_lines[1:]:
        current = get_meeting(line, month, year, last_date)
        meetings.append(current)
        last_date = current.date
    return meetings
 def get_meeting(line: FirefoxWebElement, month: str, year: str, last_date: date) -> Meeting:
    """Build a Meeting from one row of the monthly calendar table.

    :param line: table row element of the calendar
    :param month: German month name from the page heading (key of MONTHS)
    :param year: year from the page heading
    :param last_date: date of the previous row, reused when this row has no
        day number of its own (may be None for the very first row)
    :return: meeting with name, date, start/end time, agenda link and
        location; ``agenda`` and ``address`` are left as None
    :raises KeyError: if *month* is not a known month name
    :raises ValueError: if the time cell is not of the form ``"HH:MM - HH:MM"``
    """
    tds = line.find_elements(By.XPATH, "td")
    date_str: str = str(tds[1].text).strip()
    if date_str:
        # Index instead of .get(): an unknown month name fails with the name
        # in the error rather than with an opaque TypeError inside date().
        date_obj = date(int(year), MONTHS[month], int(date_str))
    else:
        date_obj = last_date
    start_time, end_time = str(tds[2].text).strip().split(" - ")
    start_time_obj = time.fromisoformat(start_time)
    end_time_obj = time.fromisoformat(end_time)
    # One lookup of the link serves both the meeting name and the agenda URL.
    anchor = tds[5].find_element(By.TAG_NAME, "a")
    name = str(anchor.text)
    agenda_link = str(anchor.get_property("href"))
    location = str(tds[8].text)
    return meeting.Meeting(name=name, date=date_obj,
                           time=start_time_obj, end_time=end_time_obj,
                           link=agenda_link, location=location,
                           agenda=None, address=None)
 def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
    """Run process_agenda() for every entry of *meetings*, in list order.

    :param driver: driver used to open the agenda pages
    :param meetings: meetings that are updated in place
    """
    for current in meetings:
        process_agenda(driver, current)
 def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> None:
    """Load the agenda page of *meeting_obj* and store its address and agenda.

    :param driver: driver used to open ``meeting_obj.link``
    :param meeting_obj: meeting whose ``address`` and ``agenda`` are set
    """
    driver.get(meeting_obj.link)
    content_cell = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]")
    child_tables = content_cell.find_elements(By.XPATH, "table")
    # First table holds the meeting meta data, second one the agenda rows.
    meta_table, agenda_table = child_tables[0], child_tables[1]

    meta_rows = meta_table.find_elements(By.XPATH, "./tbody//tr//td[1]//tr")
    address_cell = meta_rows[5].find_element(By.XPATH, "td[2]")
    meeting_obj.address = str(address_cell.text)

    item_rows = agenda_table.find_elements(
            By.XPATH,
            ".//tr[not(descendant::th) and not(descendant::td[contains(@colspan, '7')])]")
    # The last matched row is not an agenda item and is dropped.
    meeting_obj.agenda = agenda.Agenda([
        process_agenda_item(position, row)
        for position, row in enumerate(item_rows[:-1])
    ])
 def process_agenda_item(index: int, item: WebElement) -> agenda.AgendaItem:
    """Convert one row of the agenda table into an AgendaItem.

    :param index: position of the row, stored as ``order``
    :param item: table row element of the agenda
    :return: agenda item; ``motion_link`` and ``motion_reference`` are None
        when the motion cell of the row is empty
    """
    tds = item.find_elements(By.XPATH, "td")
    # By.TAG_NAME replaces the find_element_by_tag_name helper and matches the
    # locator style used elsewhere in this module; one lookup per link.
    number_anchor = tds[0].find_element(By.TAG_NAME, "a")
    item_link = str(number_anchor.get_property("href")).strip()
    number = str(number_anchor.text).strip()
    name = str(tds[3].text).strip()
    public = "Ö" in number

    motion_link = None
    motion_reference = None
    if str(tds[5].text).strip():
        motion_anchor = tds[5].find_element(By.TAG_NAME, "a")
        motion_link = str(motion_anchor.get_property("href")).strip()
        motion_reference = str(motion_anchor.text).strip()

    return agenda.AgendaItem(number=number, order=index, name=name,
                             public=public, link=item_link,
                             motion_link=motion_link, motion_reference=motion_reference)
 def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, agenda.Motion]:
    """Fetch every motion that is linked from the agendas of *meetings*.

    Agenda items without a motion link are skipped. When the same reference
    occurs more than once, the motion fetched last replaces the earlier one.

    :param driver: driver used to open the motion pages
    :param meetings: meetings whose ``agenda`` has already been filled
    :return: mapping of motion reference to the scraped Motion
    """
    linked_items = (
        entry
        for current in meetings
        for entry in current.agenda.agendaItems
        if entry.motion_link is not None
    )
    return {
        entry.motion_reference: get_motion(driver, entry.motion_link,
                                           entry.motion_reference)
        for entry in linked_items
    }
 def _join_paragraphs(container: WebElement) -> str:
    """Return the text of the inner ``<p>`` elements of *container*, one per line.

    The first and the last paragraph are skipped and every remaining text is
    stripped. Empty paragraphs at the start do not produce leading newlines.
    """
    paragraphs = container.find_elements(By.TAG_NAME, "p")[1:-1]
    texts = [str(paragraph.text).strip() for paragraph in paragraphs]
    return "\n".join(texts).lstrip("\n")


 def get_motion(driver: webdriver.Firefox, link: str, reference: str) -> agenda.Motion:
    """Open a motion page and scrape its meta data and text sections.

    :param driver: driver used to open *link*
    :param link: URL of the motion page
    :param reference: reference of the motion, stored unchanged in the result
    :return: Motion with name, type, responsible party, context and petition
    """
    driver.get(link)
    meta_table = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table")
    meta_trs = meta_table.find_elements(By.XPATH, "./tbody//tr")
    name = str(meta_trs[0].find_element(By.XPATH, "td[2]").text).strip()
    motion_type = str(meta_trs[1].find_element(By.XPATH, "td[4]").text).strip()
    under_direction_of = str(meta_trs[2].find_element(By.XPATH, "td[2]").text).strip()

    # First div holds the context text, second one the petition text.
    text_divs = driver.find_elements(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//div")
    context = _join_paragraphs(text_divs[0])
    # The original called rstrip() without using its result; assign it so
    # trailing empty paragraphs really are removed from the petition.
    petition = _join_paragraphs(text_divs[1]).rstrip()

    return agenda.Motion(name=name, reference=reference,
                         type=motion_type, underDirectionOf=under_direction_of,
                         context=context, petition=petition)
 # Run the public-calendar scraper when this module is executed as a script.
 if __name__ == "__main__":
    main()