Implemented scraping from public calendar

This commit is contained in:
2020-07-02 23:26:23 +02:00
parent 8f0c9f37e2
commit fe11096263
7 changed files with 305 additions and 67 deletions

View File

@ -15,18 +15,36 @@
# limitations under the License. # limitations under the License.
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Dict from typing import List
@dataclass
class Consultation:
    """Plain record pairing an ``authoritative`` flag with a ``role`` label."""

    # NOTE(review): nothing visible here constructs this class yet; the exact
    # meaning of both fields should be confirmed against the scraper.
    authoritative: bool
    role: str
@dataclass @dataclass
class Motion: class Motion:
id: str name: str
title: str reference: str
text: str type: str
underDirectionOf: str
context: str
petition: str
@dataclass
class AgendaItem:
    """One row of a meeting agenda as scraped from the agenda table."""

    # Label shown in the first column of the agenda row.
    number: str
    # Zero-based position of the row within the scraped agenda.
    order: int
    # Title text of the agenda item.
    name: str
    # True when the number label contains "Ö".
    public: bool
    # URL behind the number label.
    link: str
    # URL and label of the attached motion; the scraper passes None for both
    # when the motion column of the row is empty.
    motion_link: str
    motion_reference: str
@dataclass @dataclass
class Agenda: class Agenda:
noticesOfChair: Dict[str, Motion] agendaItems: List[AgendaItem]
noticesOfAdministration: Dict[str, Motion]
motions: Dict[str, Motion]

View File

@ -0,0 +1,21 @@
import configparser
from twomartens.allrisscraper.definitions import CONFIG_PROPS
def initialize_config(config_file: str) -> bool:
    """Ensure the configuration file exists, writing a template if it does not.

    Args:
        config_file: Path of the INI file to probe.

    Returns:
        ``True`` if the file could be opened for reading. ``False`` if it was
        missing; in that case a new file filled with the ``CONFIG_PROPS``
        defaults has been written to ``config_file``.
    """
    try:
        # Opening for reading is only an existence probe; nothing is read.
        probe = open(config_file, "r")
    except FileNotFoundError:
        pass
    else:
        probe.close()
        return True

    with open(config_file, "w") as target:
        template = configparser.ConfigParser()
        for section, options in CONFIG_PROPS.items():
            template[section] = {}
            for option, default in options.items():
                template[section][option] = default
        template.write(target)
    return False

View File

@ -0,0 +1,12 @@
import dataclasses
import datetime
import json
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally handles dataclasses, dates and times."""

    def default(self, o):
        """Return a JSON-serializable stand-in for *o*.

        Dataclass instances become dictionaries (via ``dataclasses.asdict``),
        ``datetime.date`` and ``datetime.time`` objects become their ``str()``
        form. Everything else is delegated to the base class, which raises
        ``TypeError``.
        """
        # is_dataclass() is also true for the dataclass *class* itself, which
        # asdict() rejects; let those fall through to the regular TypeError.
        if dataclasses.is_dataclass(o) and not isinstance(o, type):
            return dataclasses.asdict(o)
        if isinstance(o, (datetime.date, datetime.time)):
            return str(o)
        return super().default(o)

View File

@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import calendar
ABBREVIATIONS = { ABBREVIATIONS = {
"Altona": { "Altona": {
@ -56,3 +56,41 @@ BASE_LINKS = {
"Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri", "Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri",
"Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri", "Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri",
} }
# Base URL of the "/bi" site for every district, keyed by district name.
PUBLIC_BASE_LINKS = {
    district: f"https://sitzungsdienst-{subdomain}.hamburg.de/bi"
    for district, subdomain in (
        ("Altona", "altona"),
        ("Bergedorf", "bergedorf"),
        ("Eimsbüttel", "eimsbuettel"),
        ("Hamburg-Mitte", "hamburg-mitte"),
        ("Hamburg-Nord", "hamburg-nord"),
        ("Harburg", "harburg"),
        ("Wandsbek", "wandsbek"),
    )
}
# Short links hosted on 2martens.de.
# NOTE(review): judging by the names these presumably lead to the login-protected
# and the public ALLRIS site for Eimsbüttel — confirm the redirect targets.
ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
# Sections, options and placeholder values written into a freshly created
# configuration file.
CONFIG_PROPS = {
    "Default": dict(
        district="Eimsbüttel",
        username="max.mustermann@eimsbuettel.de",
        password="SehrSicheresPasswort",
        pdflocation="/Pfad/zum/Ablegen/der/PDFs/",
        jsonLocation="/Pfad/zum/Ablegen/der/jsons/",
        firefoxBinary="/Pfad/zur/firefox.exe",
    ),
}
# German month names mapped to their 1-based month number.
MONTHS = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "Januar",
            "Februar",
            "März",
            "April",
            "Mai",
            "Juni",
            "Juli",
            "August",
            "September",
            "Oktober",
            "November",
            "Dezember",
        ),
        start=1,
    )
}

View File

@ -28,39 +28,15 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.remote.webelement import WebElement
from twomartens.allrisscraper import meeting from twomartens.allrisscraper import config as config_module
from twomartens.allrisscraper import definitions from twomartens.allrisscraper import definitions
from twomartens.allrisscraper import meeting
from twomartens.allrisscraper.definitions import ALLRIS_LOGIN
ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
_CONFIG_PROPS = {
"Default": {
"district": "Eimsbüttel",
"username": "max.mustermann@eimsbuettel.de",
"password": "SehrSicheresPasswort",
"pdflocation": "/Pfad/zum/Ablegen/der/PDFs/",
"firefoxBinary": "/Pfad/zur/firefox.exe",
}
}
def main() -> None: def main() -> None:
config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
try: if not config_module.initialize_config(config_file):
with open(config_file, "r"):
# if we reach this branch then the file exists and everything is fine
pass
except FileNotFoundError:
with open(config_file, "w") as file:
parser = configparser.ConfigParser()
for section in _CONFIG_PROPS:
parser[section] = {}
for option in _CONFIG_PROPS[section]:
default = _CONFIG_PROPS[section][option]
parser[section][option] = default
parser.write(file)
return return
config = configparser.ConfigParser() config = configparser.ConfigParser()
@ -83,7 +59,6 @@ def main() -> None:
driver.get("https://serviceportal.hamburg.de/HamburgGateway/Service/StartService/ALLMAnd") driver.get("https://serviceportal.hamburg.de/HamburgGateway/Service/StartService/ALLMAnd")
driver.get(f"{base_url}/si012.asp") driver.get(f"{base_url}/si012.asp")
meetings = get_meetings(driver) meetings = get_meetings(driver)
fill_agendas_committees(driver, meetings)
download_documents(driver, meetings, pdf_location, base_url, district) download_documents(driver, meetings, pdf_location, base_url, district)
driver.close() driver.close()
@ -108,26 +83,14 @@ def get_meetings(driver: webdriver.Firefox) -> List[meeting.Meeting]:
agenda_link = tds[4].find_element_by_tag_name("a").get_property("href") agenda_link = tds[4].find_element_by_tag_name("a").get_property("href")
name = tds[4].find_element_by_tag_name("a").text name = tds[4].find_element_by_tag_name("a").text
location = tds[5].text location = tds[5].text
meetings.append(meeting.Meeting(name, date_obj, time_obj, agenda_link, location, None)) meetings.append(meeting.Meeting(name=name, date=date_obj,
time=time_obj, end_time=None,
link=agenda_link, location=location,
agenda=None, address=None))
return meetings return meetings
def fill_agendas_committees(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
notices_of_chair = "Mitteilungen der/des Vorsitzenden"
notices_of_administration = "Mitteilungen der Verwaltung"
motions = "Anträge / Vorlagen der Verwaltung"
for _meeting in meetings:
driver.get(_meeting.link)
td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_chair + "']")
topChair = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_administration + "']")
topAdmin = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
td = driver.find_element(By.XPATH, "//td[text()='" + motions + "']")
topMotions = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
pass
def download_documents(driver: webdriver.Firefox, meetings: List[meeting.Meeting], def download_documents(driver: webdriver.Firefox, meetings: List[meeting.Meeting],
pdf_location: str, base_url: str, district: str) -> None: pdf_location: str, base_url: str, district: str) -> None:
base_link = f"{base_url}/do027.asp" base_link = f"{base_url}/do027.asp"

View File

@ -25,6 +25,8 @@ class Meeting:
name: str name: str
date: datetime.date date: datetime.date
time: datetime.time time: datetime.time
end_time: Optional[datetime.time]
link: str link: str
location: str location: str
address: Optional[str]
agenda: Optional[Agenda] agenda: Optional[Agenda]

View File

@ -0,0 +1,184 @@
import configparser
import dataclasses
import json
import os
from datetime import date
from datetime import time
from typing import Dict
from typing import List
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.webelement import FirefoxWebElement
from selenium.webdriver.remote.webelement import WebElement
from twomartens.allrisscraper import agenda
from twomartens.allrisscraper import config as config_module
from twomartens.allrisscraper import definitions
from twomartens.allrisscraper import custom_json
from twomartens.allrisscraper import meeting
from twomartens.allrisscraper.definitions import MONTHS
from twomartens.allrisscraper.meeting import Meeting
def _write_json(directory: str, filename: str, payload) -> None:
    """Serialize *payload* into ``directory/filename`` using the project encoder."""
    # os.path.join keeps the file inside the directory even when the configured
    # location has no trailing path separator.
    with open(os.path.join(directory, filename), "w") as file:
        json.dump(payload, file, cls=custom_json.EnhancedJSONEncoder)


def main() -> None:
    """Scrape the public calendar and store meetings and motions as JSON.

    Reads ``tm-allris-scraper-config.ini`` from the current working directory.
    If the file did not exist, a template is created and the function returns
    without scraping. Otherwise the calendar of the configured district is
    loaded in Firefox, every meeting's agenda and every referenced motion is
    scraped, and the results are written to ``meetings.json`` and
    ``motions.json`` inside the configured ``jsonLocation`` directory.
    """
    config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
    if not config_module.initialize_config(config_file):
        return

    config = configparser.ConfigParser()
    config.read(config_file)
    district = config["Default"]["district"]
    json_path = config["Default"]["jsonLocation"]
    firefox_binary = config["Default"]["firefoxBinary"]
    base_url = definitions.PUBLIC_BASE_LINKS[district]

    options = Options()
    options.headless = False
    binary = FirefoxBinary(firefox_binary)
    driver = webdriver.Firefox(firefox_binary=binary, options=options)
    try:
        driver.implicitly_wait(2)
        # NOTE: the scraped calendar month is fixed to June 2020 by this URL.
        driver.get(f"{base_url}/si010_e.asp?MM=6&YY=2020")
        meetings = get_meetings(driver)
        process_agendas(driver, meetings)
        motions = get_motions(driver, meetings)
    finally:
        # Always end the browser session, even when scraping fails half-way.
        driver.quit()

    os.makedirs(json_path, exist_ok=True)
    _write_json(json_path, "meetings.json", meetings)
    _write_json(json_path, "motions.json", motions)
def get_meetings(driver: webdriver):
    """Collect all meetings listed on the currently loaded calendar page.

    The month name and year are read from the first row found inside the
    ``risdeco`` table. Calendar rows come from the ``tl1`` table, excluding
    rows that contain a cell whose ``colspan`` contains ``8``. The first
    remaining row is discarded (presumably the column header — verify against
    the page).

    Returns:
        A list of meetings in page order. Each row receives the date of the
        preceding meeting so that rows without a day can reuse it.
    """
    heading_row = driver.find_element(By.XPATH, "//table[@class='risdeco']//table[1]//tr")
    month, year = str(heading_row.text).strip().split(" ")

    rows = driver.find_elements(
        By.XPATH,
        "//table[@class='tl1']//tr[not(descendant::td[contains(@colspan, '8')])]"
    )
    del rows[0]

    collected = []
    for row in rows:
        previous_date = collected[-1].date if collected else None
        collected.append(get_meeting(row, month, year, previous_date))
    return collected
def get_meeting(line: FirefoxWebElement, month: str, year: str, last_date: date) -> Meeting:
    """Build a ``Meeting`` from one row of the public calendar table.

    Args:
        line: Table row (``tr``) element describing the meeting.
        month: German month name of the calendar page; must be a key of ``MONTHS``.
        year: Year of the calendar page as a decimal string.
        last_date: Date of the previously parsed row. It is reused when the day
            cell of this row is empty; callers pass ``None`` for the first row.

    Returns:
        The meeting, with ``agenda`` and ``address`` still set to ``None``.
        ``end_time`` is ``None`` when the time cell holds no `` - `` separated
        end time.

    Raises:
        KeyError: If *month* is not a known month name.
        ValueError: If the day or a time value cannot be parsed.
    """
    cells = line.find_elements(By.XPATH, "td")

    day_text = str(cells[1].text).strip()
    if day_text:
        # Index instead of .get(): an unknown month should fail right here and
        # not surface as an obscure TypeError inside date().
        date_obj = date(int(year), MONTHS[month], int(day_text))
    else:
        date_obj = last_date

    # partition() tolerates a missing end time, which the Meeting model allows.
    start_text, _, end_text = str(cells[2].text).strip().partition(" - ")
    start_time_obj = time.fromisoformat(start_text)
    end_time_obj = time.fromisoformat(end_text) if end_text else None

    # Look the anchor up once; it carries both the name and the agenda link.
    anchor = cells[5].find_element(By.TAG_NAME, "a")
    name = str(anchor.text)
    agenda_link = str(anchor.get_property("href"))
    location = str(cells[8].text)

    return meeting.Meeting(name=name, date=date_obj,
                           time=start_time_obj, end_time=end_time_obj,
                           link=agenda_link, location=location,
                           agenda=None, address=None)
def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
    """Scrape the agenda page of every meeting, updating each meeting in place."""
    for current_meeting in meetings:
        process_agenda(driver, current_meeting)
def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> None:
    """Load a meeting's agenda page and fill in its address and agenda.

    Navigates the driver to ``meeting_obj.link``. The first table inside the
    content cell supplies the address (sixth metadata row, second cell); the
    second table supplies the agenda rows. Rows containing a ``th`` or a cell
    whose ``colspan`` contains ``7`` are excluded, and the last remaining row
    is dropped.
    """
    driver.get(meeting_obj.link)
    content_cell = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]")
    tables = content_cell.find_elements(By.XPATH, "table")
    meta_table, agenda_table = tables[0], tables[1]

    meta_rows = meta_table.find_elements(By.XPATH, "./tbody//tr//td[1]//tr")
    address_cell = meta_rows[5].find_element(By.XPATH, "td[2]")
    meeting_obj.address = str(address_cell.text)

    item_rows = agenda_table.find_elements(
        By.XPATH,
        ".//tr[not(descendant::th) and not(descendant::td[contains(@colspan, '7')])]")
    meeting_obj.agenda = agenda.Agenda([
        process_agenda_item(position, row)
        for position, row in enumerate(item_rows[:-1])
    ])
def process_agenda_item(index: int, item: WebElement) -> agenda.AgendaItem:
    """Convert one agenda table row into an ``AgendaItem``.

    Args:
        index: Position of the row within the agenda; stored as ``order``.
        item: Table row (``tr``) element of the agenda item.

    Returns:
        The agenda item. ``motion_link`` and ``motion_reference`` are ``None``
        when the motion cell (sixth column) contains no text.
    """
    cells = item.find_elements(By.XPATH, "td")

    # One lookup serves both the link and the label of the item number.
    number_anchor = cells[0].find_element(By.TAG_NAME, "a")
    item_link = str(number_anchor.get_property("href")).strip()
    number = str(number_anchor.text).strip()
    name = str(cells[3].text).strip()
    public = "Ö" in number

    motion_link = None
    motion_reference = None
    if str(cells[5].text).strip():
        motion_anchor = cells[5].find_element(By.TAG_NAME, "a")
        motion_link = str(motion_anchor.get_property("href")).strip()
        motion_reference = str(motion_anchor.text).strip()

    return agenda.AgendaItem(number=number, order=index, name=name,
                             public=public, link=item_link,
                             motion_link=motion_link, motion_reference=motion_reference)
def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, agenda.Motion]:
    """Scrape every motion referenced by the agendas of *meetings*.

    Each meeting must already carry an agenda. Agenda items without a motion
    link are skipped.

    Returns:
        The scraped motions keyed by their reference; a reference that occurs
        more than once is scraped again and replaces the earlier entry.
    """
    collected: Dict[str, agenda.Motion] = {}
    for current_meeting in meetings:
        for entry in current_meeting.agenda.agendaItems:
            if entry.motion_link is not None:
                collected[entry.motion_reference] = get_motion(
                    driver, entry.motion_link, entry.motion_reference)
    return collected
def _join_paragraphs(container) -> str:
    """Join the stripped text of the inner ``p`` elements of *container* with newlines."""
    # The first and the last paragraph are skipped (presumably a heading and a
    # trailing filler — verify against the page).
    paragraphs = container.find_elements(By.TAG_NAME, "p")[1:-1]
    joined = "\n".join(str(paragraph.text).strip() for paragraph in paragraphs)
    # Empty paragraphs at the start must not produce leading blank lines.
    return joined.lstrip("\n")


def get_motion(driver: webdriver.Firefox, link: str, reference: str) -> agenda.Motion:
    """Load a motion page and extract its metadata and text sections.

    Args:
        driver: Browser used to open the page; it is navigated to *link*.
        link: URL of the motion page.
        reference: Reference label of the motion, stored unchanged.

    Returns:
        The motion with name, type and responsible party taken from the first
        three metadata rows, the context taken from the first text ``div`` and
        the petition from the second. Trailing whitespace of the petition is
        removed.
    """
    driver.get(link)
    meta_table = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table")
    meta_rows = meta_table.find_elements(By.XPATH, "./tbody//tr")
    name = str(meta_rows[0].find_element(By.XPATH, "td[2]").text).strip()
    motion_type = str(meta_rows[1].find_element(By.XPATH, "td[4]").text).strip()
    under_direction_of = str(meta_rows[2].find_element(By.XPATH, "td[2]").text).strip()

    text_divs = driver.find_elements(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//div")
    context = _join_paragraphs(text_divs[0])
    # str.rstrip() returns a new string; the result has to be kept.
    petition = _join_paragraphs(text_divs[1]).rstrip()

    return agenda.Motion(name=name, reference=reference,
                         type=motion_type, underDirectionOf=under_direction_of,
                         context=context, petition=petition)
# Run the scraper only when this module is executed as a script.
if __name__ == "__main__":
    main()