Implemented scraping from public calendar
This commit is contained in:
@ -15,18 +15,36 @@
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
|
||||
class Consultation:
|
||||
authoritative: bool
|
||||
role: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Motion:
|
||||
id: str
|
||||
title: str
|
||||
text: str
|
||||
name: str
|
||||
reference: str
|
||||
type: str
|
||||
underDirectionOf: str
|
||||
context: str
|
||||
petition: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgendaItem:
|
||||
number: str
|
||||
order: int
|
||||
name: str
|
||||
public: bool
|
||||
link: str
|
||||
motion_link: str
|
||||
motion_reference: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Agenda:
|
||||
noticesOfChair: Dict[str, Motion]
|
||||
noticesOfAdministration: Dict[str, Motion]
|
||||
motions: Dict[str, Motion]
|
||||
agendaItems: List[AgendaItem]
|
||||
|
||||
21
src/twomartens/allrisscraper/config.py
Normal file
21
src/twomartens/allrisscraper/config.py
Normal file
@ -0,0 +1,21 @@
|
||||
import configparser
|
||||
|
||||
from twomartens.allrisscraper.definitions import CONFIG_PROPS
|
||||
|
||||
|
||||
def initialize_config(config_file: str) -> bool:
    """Make sure the configuration file exists, writing a template if not.

    Args:
        config_file: Path of the INI file to probe.

    Returns:
        True when the file could already be opened for reading; False when
        it was missing and has just been created from the defaults in
        CONFIG_PROPS.
    """
    try:
        # Opening for reading is only an existence probe; nothing is read.
        with open(config_file, "r"):
            return True
    except FileNotFoundError:
        with open(config_file, "w") as handle:
            template = configparser.ConfigParser()
            for section_name, defaults in CONFIG_PROPS.items():
                template[section_name] = {}
                for key, value in defaults.items():
                    template[section_name][key] = value

            template.write(handle)
    return False
|
||||
12
src/twomartens/allrisscraper/custom_json.py
Normal file
12
src/twomartens/allrisscraper/custom_json.py
Normal file
@ -0,0 +1,12 @@
|
||||
import dataclasses
|
||||
import datetime
|
||||
import json
|
||||
|
||||
|
||||
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally handles dataclasses, dates and times."""

    def default(self, o):
        """Return a JSON-serializable stand-in for *o*.

        Dataclass instances are converted with ``dataclasses.asdict``;
        ``datetime.date`` and ``datetime.time`` objects are rendered with
        ``str()``. Everything else is delegated to the base class, which
        raises ``TypeError``.
        """
        # is_dataclass() is also true for dataclass *classes*, which asdict()
        # rejects, so only instances are converted here.
        if dataclasses.is_dataclass(o) and not isinstance(o, type):
            return dataclasses.asdict(o)
        if isinstance(o, (datetime.date, datetime.time)):
            return str(o)
        return super().default(o)
|
||||
@ -13,7 +13,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import calendar
|
||||
|
||||
ABBREVIATIONS = {
|
||||
"Altona": {
|
||||
@ -56,3 +56,41 @@ BASE_LINKS = {
|
||||
"Harburg": "https://sitzungsdienst-harburg.hamburg.de/ri",
|
||||
"Wandsbek": "https://sitzungsdienst-wandsbek.hamburg.de/ri",
|
||||
}
|
||||
|
||||
# District name -> base URL ending in "/bi"; the host only differs in the
# ASCII spelling of the district.
PUBLIC_BASE_LINKS = {
    district: f"https://sitzungsdienst-{host_part}.hamburg.de/bi"
    for district, host_part in (
        ("Altona", "altona"),
        ("Bergedorf", "bergedorf"),
        ("Eimsbüttel", "eimsbuettel"),
        ("Hamburg-Mitte", "hamburg-mitte"),
        ("Hamburg-Nord", "hamburg-nord"),
        ("Harburg", "harburg"),
        ("Wandsbek", "wandsbek"),
    )
}

ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"

# Sections, options and placeholder values for the configuration template.
CONFIG_PROPS = {
    "Default": {
        "district": "Eimsbüttel",
        "username": "max.mustermann@eimsbuettel.de",
        "password": "SehrSicheresPasswort",
        "pdflocation": "/Pfad/zum/Ablegen/der/PDFs/",
        "jsonLocation": "/Pfad/zum/Ablegen/der/jsons/",
        "firefoxBinary": "/Pfad/zur/firefox.exe",
    }
}

# German month name -> month number (1-12).
MONTHS = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "Januar", "Februar", "März", "April", "Mai", "Juni",
            "Juli", "August", "September", "Oktober", "November", "Dezember",
        ),
        start=1,
    )
}
|
||||
|
||||
@ -28,39 +28,15 @@ from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
|
||||
from twomartens.allrisscraper import meeting
|
||||
from twomartens.allrisscraper import config as config_module
|
||||
from twomartens.allrisscraper import definitions
|
||||
|
||||
|
||||
ALLRIS_LOGIN: str = "https://2martens.de/allris-eimsbüttel"
|
||||
ALLRIS_OPEN: str = "https://2martens.de/bezirk-eimsbüttel"
|
||||
_CONFIG_PROPS = {
|
||||
"Default": {
|
||||
"district": "Eimsbüttel",
|
||||
"username": "max.mustermann@eimsbuettel.de",
|
||||
"password": "SehrSicheresPasswort",
|
||||
"pdflocation": "/Pfad/zum/Ablegen/der/PDFs/",
|
||||
"firefoxBinary": "/Pfad/zur/firefox.exe",
|
||||
}
|
||||
}
|
||||
from twomartens.allrisscraper import meeting
|
||||
from twomartens.allrisscraper.definitions import ALLRIS_LOGIN
|
||||
|
||||
|
||||
def main() -> None:
|
||||
config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
|
||||
try:
|
||||
with open(config_file, "r"):
|
||||
# if we reach this branch then the file exists and everything is fine
|
||||
pass
|
||||
except FileNotFoundError:
|
||||
with open(config_file, "w") as file:
|
||||
parser = configparser.ConfigParser()
|
||||
for section in _CONFIG_PROPS:
|
||||
parser[section] = {}
|
||||
for option in _CONFIG_PROPS[section]:
|
||||
default = _CONFIG_PROPS[section][option]
|
||||
parser[section][option] = default
|
||||
|
||||
parser.write(file)
|
||||
if not config_module.initialize_config(config_file):
|
||||
return
|
||||
|
||||
config = configparser.ConfigParser()
|
||||
@ -83,7 +59,6 @@ def main() -> None:
|
||||
driver.get("https://serviceportal.hamburg.de/HamburgGateway/Service/StartService/ALLMAnd")
|
||||
driver.get(f"{base_url}/si012.asp")
|
||||
meetings = get_meetings(driver)
|
||||
fill_agendas_committees(driver, meetings)
|
||||
download_documents(driver, meetings, pdf_location, base_url, district)
|
||||
driver.close()
|
||||
|
||||
@ -108,26 +83,14 @@ def get_meetings(driver: webdriver.Firefox) -> List[meeting.Meeting]:
|
||||
agenda_link = tds[4].find_element_by_tag_name("a").get_property("href")
|
||||
name = tds[4].find_element_by_tag_name("a").text
|
||||
location = tds[5].text
|
||||
meetings.append(meeting.Meeting(name, date_obj, time_obj, agenda_link, location, None))
|
||||
meetings.append(meeting.Meeting(name=name, date=date_obj,
|
||||
time=time_obj, end_time=None,
|
||||
link=agenda_link, location=location,
|
||||
agenda=None, address=None))
|
||||
|
||||
return meetings
|
||||
|
||||
|
||||
def fill_agendas_committees(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
|
||||
notices_of_chair = "Mitteilungen der/des Vorsitzenden"
|
||||
notices_of_administration = "Mitteilungen der Verwaltung"
|
||||
motions = "Anträge / Vorlagen der Verwaltung"
|
||||
for _meeting in meetings:
|
||||
driver.get(_meeting.link)
|
||||
td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_chair + "']")
|
||||
topChair = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
|
||||
td = driver.find_element(By.XPATH, "//td[text()='" + notices_of_administration + "']")
|
||||
topAdmin = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
|
||||
td = driver.find_element(By.XPATH, "//td[text()='" + motions + "']")
|
||||
topMotions = td.find_element(By.XPATH, '..').find_element(By.CSS_SELECTOR, 'td:first-child').find_element_by_tag_name("a").text
|
||||
pass
|
||||
|
||||
|
||||
def download_documents(driver: webdriver.Firefox, meetings: List[meeting.Meeting],
|
||||
pdf_location: str, base_url: str, district: str) -> None:
|
||||
base_link = f"{base_url}/do027.asp"
|
||||
|
||||
@ -25,6 +25,8 @@ class Meeting:
|
||||
name: str
|
||||
date: datetime.date
|
||||
time: datetime.time
|
||||
end_time: Optional[datetime.time]
|
||||
link: str
|
||||
location: str
|
||||
address: Optional[str]
|
||||
agenda: Optional[Agenda]
|
||||
|
||||
184
src/twomartens/allrisscraper/public.py
Normal file
184
src/twomartens/allrisscraper/public.py
Normal file
@ -0,0 +1,184 @@
|
||||
import configparser
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
from datetime import date
|
||||
from datetime import time
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.firefox.webelement import FirefoxWebElement
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
|
||||
from twomartens.allrisscraper import agenda
|
||||
from twomartens.allrisscraper import config as config_module
|
||||
from twomartens.allrisscraper import definitions
|
||||
from twomartens.allrisscraper import custom_json
|
||||
from twomartens.allrisscraper import meeting
|
||||
from twomartens.allrisscraper.definitions import MONTHS
|
||||
from twomartens.allrisscraper.meeting import Meeting
|
||||
|
||||
|
||||
def main():
    """Scrape one month of the public calendar and dump the result as JSON.

    Reads ``tm-allris-scraper-config.ini`` from the current working
    directory. If that file did not exist yet, a template is written and the
    function returns without scraping. Otherwise Firefox is driven through
    the calendar page, the agenda of every meeting and every linked motion,
    and ``meetings.json`` / ``motions.json`` are written to the configured
    ``jsonLocation`` directory.

    Raises:
        KeyError: If the configured district has no entry in
            ``definitions.PUBLIC_BASE_LINKS`` or a config option is missing.
    """
    config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
    if not config_module.initialize_config(config_file):
        # A fresh template was just written; it has to be filled in first.
        return

    config = configparser.ConfigParser()
    config.read(config_file)
    district = config["Default"]["district"]
    json_path = config["Default"]["jsonLocation"]
    firefox_binary = config["Default"]["firefoxBinary"]
    base_url = definitions.PUBLIC_BASE_LINKS[district]

    options = Options()
    options.headless = False
    binary = FirefoxBinary(firefox_binary)
    driver = webdriver.Firefox(firefox_binary=binary, options=options)
    try:
        driver.implicitly_wait(2)
        # NOTE(review): month and year are hard-coded to June 2020.
        driver.get(f"{base_url}/si010_e.asp?MM=6&YY=2020")
        meetings = get_meetings(driver)
        process_agendas(driver, meetings)
        motions = get_motions(driver, meetings)
    finally:
        # Shut the browser down even when scraping fails half-way, so no
        # Firefox/driver process is left behind.
        driver.quit()

    os.makedirs(json_path, exist_ok=True)
    outputs = (("meetings.json", meetings), ("motions.json", motions))
    for file_name, payload in outputs:
        # os.path.join also works when jsonLocation has no trailing separator.
        with open(os.path.join(json_path, file_name), "w") as file:
            json.dump(payload, file, cls=custom_json.EnhancedJSONEncoder)
|
||||
|
||||
|
||||
def get_meetings(driver: webdriver.Firefox) -> List[Meeting]:
    """Collect the meetings listed on the calendar page the driver shows.

    Args:
        driver: Browser that has already loaded the monthly calendar page.

    Returns:
        One Meeting per calendar row, in page order. Rows without their own
        day number inherit the date of the row before them.

    Raises:
        ValueError: If the calendar heading is not exactly "<month> <year>".
    """
    heading = driver.find_element(By.XPATH, "//table[@class='risdeco']//table[1]//tr")
    year_month: str = str(heading.text).strip()
    month, year = year_month.split(" ")
    calendar_lines = driver.find_elements(
        By.XPATH,
        "//table[@class='tl1']//tr[not(descendant::td[contains(@colspan, '8')])]"
    )

    meetings: List[Meeting] = []
    last_date = None
    # The first matched row is skipped (presumably the column header row --
    # TODO confirm). Slicing, unlike remove(), also copes with an empty result.
    for line in calendar_lines[1:]:
        current = get_meeting(line, month, year, last_date)
        meetings.append(current)
        last_date = current.date
    return meetings
|
||||
|
||||
|
||||
def get_meeting(line: FirefoxWebElement, month: str, year: str, last_date: date) -> Meeting:
    """Build a Meeting from one row of the calendar table.

    Args:
        line: The table row element of the calendar entry.
        month: Month name from the calendar heading; must be a key of MONTHS.
        year: Year from the calendar heading, as text.
        last_date: Date of the previous row (``None`` for the first row).
            It is reused when this row's own day cell is empty.

    Returns:
        A Meeting with name, date, start/end time, agenda link and location
        set; ``agenda`` and ``address`` are left as ``None``.

    Raises:
        KeyError: If *month* is not a known month name.
        ValueError: If the day or one of the times cannot be parsed.
    """
    tds = line.find_elements(By.XPATH, "td")

    day = str(tds[1].text).strip()
    if day:
        # Index instead of .get(): an unknown month fails with a KeyError that
        # names it, rather than a TypeError from date(..., None, ...).
        date_obj = date(int(year), MONTHS[month], int(day))
    else:
        date_obj = last_date

    # "HH:MM - HH:MM"; partition() also accepts a lone start time, in which
    # case the end time stays None (Meeting.end_time is Optional).
    start_time, _, end_time = str(tds[2].text).strip().partition(" - ")
    start_time_obj = time.fromisoformat(start_time)
    end_time_obj = time.fromisoformat(end_time) if end_time else None

    # The same anchor carries both the meeting name and the agenda link.
    anchor = tds[5].find_element_by_tag_name("a")
    name = str(anchor.text)
    agenda_link = str(anchor.get_property("href"))
    location = str(tds[8].text)

    return meeting.Meeting(name=name, date=date_obj,
                           time=start_time_obj, end_time=end_time_obj,
                           link=agenda_link, location=location,
                           agenda=None, address=None)
|
||||
|
||||
|
||||
def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
    """Run process_agenda for every meeting in *meetings*, in list order.

    Args:
        driver: Browser used to open the agenda pages.
        meetings: Meetings to process; each one is handed to process_agenda.
    """
    for current_meeting in meetings:
        process_agenda(driver, current_meeting)
|
||||
|
||||
|
||||
def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> None:
    """Open a meeting's agenda page and store address and agenda on it.

    Args:
        driver: Browser used to load ``meeting_obj.link``.
        meeting_obj: Meeting to update; its ``address`` and ``agenda``
            attributes are overwritten.
    """
    driver.get(meeting_obj.link)
    content_cell = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]")
    child_tables = content_cell.find_elements(By.XPATH, "table")
    meta_table, agenda_table = child_tables[0], child_tables[1]

    # The address is read from the second cell of the sixth meta row.
    meta_rows = meta_table.find_elements(By.XPATH, "./tbody//tr//td[1]//tr")
    address_cell = meta_rows[5].find_element(By.XPATH, "td[2]")
    meeting_obj.address = str(address_cell.text)

    item_rows = agenda_table.find_elements(
        By.XPATH,
        ".//tr[not(descendant::th) and not(descendant::td[contains(@colspan, '7')])]")
    # The last matching row is discarded (presumably a footer row -- TODO confirm).
    meeting_obj.agenda = agenda.Agenda(
        [process_agenda_item(position, row)
         for position, row in enumerate(item_rows[:-1])]
    )
|
||||
|
||||
|
||||
def process_agenda_item(index: int, item: WebElement) -> agenda.AgendaItem:
    """Turn one agenda table row into an AgendaItem.

    Args:
        index: Position of the row; stored as the item's ``order``.
        item: The table row element of the agenda item.

    Returns:
        The AgendaItem. ``motion_link`` and ``motion_reference`` are ``None``
        when the sixth cell of the row contains no text.
    """
    cells = item.find_elements(By.XPATH, "td")

    # Item number and item link come from the same anchor in the first cell.
    item_anchor = cells[0].find_element_by_tag_name("a")
    item_link = str(item_anchor.get_property("href")).strip()
    number = str(item_anchor.text).strip()
    name = str(cells[3].text).strip()

    motion_cell = cells[5]
    if str(motion_cell.text).strip():
        motion_anchor = motion_cell.find_element_by_tag_name("a")
        motion_link = str(motion_anchor.get_property("href")).strip()
        motion_reference = str(motion_anchor.text).strip()
    else:
        motion_link = None
        motion_reference = None

    return agenda.AgendaItem(number=number, order=index, name=name,
                             public="Ö" in number, link=item_link,
                             motion_link=motion_link, motion_reference=motion_reference)
|
||||
|
||||
|
||||
def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, agenda.Motion]:
    """Scrape every motion linked from the agendas of *meetings*.

    Args:
        driver: Browser used to open the motion pages.
        meetings: Meetings whose agenda items are inspected. Meetings without
            an agenda are skipped.

    Returns:
        Mapping of motion reference to the scraped Motion.
    """
    motions: Dict[str, agenda.Motion] = {}
    for _meeting in meetings:
        if _meeting.agenda is None:
            # Meeting.agenda is Optional; nothing to look at here.
            continue
        for agenda_item in _meeting.agenda.agendaItems:
            if agenda_item.motion_link is None:
                continue
            reference = agenda_item.motion_reference
            if reference in motions:
                # The same motion can sit on several agendas; one page load
                # per reference is enough.
                continue
            motions[reference] = get_motion(driver, agenda_item.motion_link, reference)
    return motions
|
||||
|
||||
|
||||
def _join_inner_paragraphs(container: WebElement) -> str:
    """Join the stripped text of the <p> elements in *container*, leaving out the first and last one."""
    joined = ""
    for paragraph in container.find_elements_by_tag_name("p")[1:-1]:
        # A separator is only added once some text exists, so leading empty
        # paragraphs do not produce leading newlines.
        if joined:
            joined += "\n"
        joined += str(paragraph.text).strip()
    return joined


def get_motion(driver: webdriver.Firefox, link: str, reference: str) -> agenda.Motion:
    """Open a motion page and extract its metadata and text sections.

    Args:
        driver: Browser used to load *link*.
        link: URL of the motion page.
        reference: Reference of the motion; stored unchanged on the result.

    Returns:
        A Motion with name, type and ``underDirectionOf`` taken from the
        meta table, ``context`` from the first text ``div`` and ``petition``
        from the second one.

    Raises:
        IndexError: If the page has fewer meta rows or text divs than expected.
    """
    driver.get(link)
    meta_table = driver.find_element(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table")
    meta_trs = meta_table.find_elements(By.XPATH, "./tbody//tr")
    name = str(meta_trs[0].find_element(By.XPATH, "td[2]").text).strip()
    motion_type = str(meta_trs[1].find_element(By.XPATH, "td[4]").text).strip()
    under_direction_of = str(meta_trs[2].find_element(By.XPATH, "td[2]").text).strip()

    text_divs = driver.find_elements(By.XPATH, "//table[@class='risdeco']//tr[2]//td[2]//div")
    context = _join_inner_paragraphs(text_divs[0])
    # The stripped value is assigned back; a bare rstrip() call would be a
    # no-op because str is immutable. This removes the newlines left behind
    # by empty trailing paragraphs.
    petition = _join_inner_paragraphs(text_divs[1]).rstrip()

    return agenda.Motion(name=name, reference=reference,
                         type=motion_type, underDirectionOf=under_direction_of,
                         context=context, petition=petition)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user