diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f49dfcc
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Jim Martens
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""setup.py for allris scraper"""
+
+from setuptools import find_packages
+from setuptools import setup
+
+with open("README.md", "rb") as f:
+    long_desc = f.read().decode()
+
+setup(
+    name="twomartens.allrisscraper",
+    description="Scraper for ALLRIS",
+    long_description=long_desc,
+    long_description_content_type="text/markdown; charset=UTF-8",
+    author="Jim Martens",
+    author_email="github@2martens.de",
+    url="https://git.2martens.de/2martens/allrisscraper",
+    version="0.1.0",
+    namespace_packages=["twomartens"],
+    packages=find_packages('src', exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
+    package_dir={'': 'src'},
+    package_data={},
+    entry_points={
+        "console_scripts": ["tm-allrisscraper = twomartens.allrisscraper.main:main"]
+    },
+    # The package uses dataclasses and time.fromisoformat(), both new in Python 3.7.
+    python_requires="~=3.7",
+    install_requires=["selenium"],
+    license="Apache License 2.0",
+    classifiers=[
+        "Operating System :: OS Independent",
+        "Development Status :: 2 - Pre-Alpha",
+        "License :: OSI Approved :: Apache Software License",
+        "Environment :: Console",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+    ],
+)
diff --git a/src/twomartens/__init__.py b/src/twomartens/__init__.py
new file mode 100644
index 0000000..400d884
--- /dev/null
+++ b/src/twomartens/__init__.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Timon Brüning, Inga Kempfert, Anne Kunstmann, Jim Martens,
+# Marius Pierenkemper, Yanneck Reiss
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""twomartens: Namespace package"""
+
+# See http://peak.telecommunity.com/DevCenter/setuptools#namespace-packages
+__import__('pkg_resources').declare_namespace(__name__)
diff --git a/src/twomartens/allrisscraper/__init__.py b/src/twomartens/allrisscraper/__init__.py
new file mode 100644
index 0000000..d945871
--- /dev/null
+++ b/src/twomartens/allrisscraper/__init__.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Timon Brüning, Inga Kempfert, Anne Kunstmann, Jim Martens,
+# Marius Pierenkemper, Yanneck Reiss
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/twomartens/allrisscraper/main.py b/src/twomartens/allrisscraper/main.py
new file mode 100644
index 0000000..90c9270
--- /dev/null
+++ b/src/twomartens/allrisscraper/main.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Jim Martens
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Log in to ALLRIS, list the meetings and download their PDF documents."""
+
+import configparser
+from urllib import request
+from datetime import date
+from datetime import time
+import os
+
+from typing import List
+
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox import webdriver
+from selenium.webdriver.firefox.options import Options
+
+from twomartens.allrisscraper import meeting
+
+
+ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
+ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
+user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3112.50 Safari/537.36'
+
+
+def main() -> None:
+    """Read ``config.ini``, log in and download the documents of all meetings.
+
+    The ``Default`` section of ``config.ini`` (looked up in the current working
+    directory) must provide ``username``, ``password`` and ``pdflocation``.
+    """
+    config = configparser.ConfigParser()
+    config.read("config.ini")
+    username = config["Default"]["username"]
+    password = config["Default"]["password"]
+    pdf_location = config["Default"]["pdflocation"]
+
+    options = Options()
+    options.headless = False
+    options.add_argument(f"user-agent={user_agent}")
+    driver = webdriver.WebDriver(options=options)
+    try:
+        driver.implicitly_wait(2)
+        driver.get(ALLRIS_LOGIN)
+        login(driver, username=username, password=password)
+        driver.get("https://gateway.hamburg.de/HamburgGateway/Service/StartService/113")
+        driver.get("https://sitzungsdienst-eimsbuettel.hamburg.de/ri/si012.asp")
+        meetings = get_meetings(driver)
+        download_documents(driver, meetings, pdf_location)
+    finally:
+        # Always shut the browser down, otherwise a failed run leaves the
+        # browser process behind.
+        driver.quit()
+
+
+def login(driver: webdriver.WebDriver, username: str, password: str) -> None:
+    """Fill in the login form of the currently loaded page and submit it."""
+    driver.find_element(By.ID, "LoginName").send_keys(username)
+    driver.find_element(By.ID, "Password").send_keys(password)
+    driver.find_element(By.ID, "buttonLogin").click()
+
+
+def get_meetings(driver: webdriver.WebDriver) -> List[meeting.Meeting]:
+    """Build a ``Meeting`` from every ``zl12`` element of the current page.
+
+    Cells are read by position: 0 is the day, 1 the time, 4 holds the link
+    whose text is the meeting name and 5 the location.
+    """
+    meetings = []
+    for row in driver.find_elements(By.CLASS_NAME, "zl12"):
+        cells = row.find_elements(By.TAG_NAME, "td")
+        anchor = cells[4].find_element(By.TAG_NAME, "a")
+        meetings.append(meeting.Meeting(
+            name=anchor.text,
+            date=get_day(cells[0].text),
+            time=time.fromisoformat(str(cells[1].text).rstrip()),
+            link=anchor.get_property("href"),
+            location=cells[5].text,
+        ))
+
+    return meetings
+
+
+def download_documents(driver: webdriver.WebDriver, meetings: List[meeting.Meeting], pdf_location: str) -> None:
+    """Download the agenda, total and invitation PDF of every meeting.
+
+    The files are stored in the directory ``pdf_location`` and are named
+    ``DATE-NAME-Tagesordnung.pdf``, ``DATE-NAME-Mappe.pdf`` and ``DATE-NAME-Einladung.pdf``.
+    """
+    base_link = "https://sitzungsdienst-eimsbuettel.hamburg.de/ri/do027.asp"
+    # File name suffixes, ordered like the forms on the meeting page.
+    suffixes = ("Tagesordnung", "Mappe", "Einladung")
+    for current in meetings:
+        driver.get(current.link)
+        td = driver.find_element(By.XPATH, "//table[@class='tk1']//td[@class='me1']")
+        forms = td.find_elements(By.TAG_NAME, "form")
+        # Resolve all links before navigating: the form elements go stale as
+        # soon as the driver leaves the meeting page.
+        links = [
+            f"{base_link}?DOLFDNR={forms[index].find_element(By.NAME, 'DOLFDNR').get_property('value')}&options=64"
+            for index in range(len(suffixes))
+        ]
+        for link, suffix in zip(links, suffixes):
+            driver.get(link)
+            file_name = f"{current.date.isoformat()}-{current.name}-{suffix}.pdf"
+            # os.path.join also works when pdf_location has no trailing separator.
+            save_pdf(driver.current_url, os.path.join(pdf_location, file_name))
+
+
+def save_pdf(url: str, dest: str) -> None:
+    """Download ``url`` and write the response body to the file ``dest``."""
+    # The context manager closes the HTTP response even if reading fails.
+    with request.urlopen(url) as response:
+        data_to_write = response.read()
+    with open(dest, "wb") as file:
+        file.write(data_to_write)
+
+
+def get_day(date_str: str) -> date:
+    """Parse a ``DD.MM.YYYY`` date, optionally preceded by a prefix and a comma.
+
+    Everything up to and including the first comma is ignored.
+    """
+    date_elements = date_str[date_str.find(",") + 1:].split(".")
+    return date(int(date_elements[-1]), int(date_elements[-2]), int(date_elements[-3]))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/twomartens/allrisscraper/meeting.py b/src/twomartens/allrisscraper/meeting.py
new file mode 100644
index 0000000..21e5fec
--- /dev/null
+++ b/src/twomartens/allrisscraper/meeting.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Jim Martens
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+from dataclasses import dataclass
+
+
+@dataclass
+class Meeting:
+    """A single meeting: its name, day, time, link and location."""
+
+    name: str
+    date: datetime.date
+    time: datetime.time
+    link: str
+    location: str