allris-scraper/src/twomartens/allrisscraper/agenda.py

156 lines
6.8 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2020 Jim Martens
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from typing import List
from typing import Optional
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from twomartens.allrisscraper import meeting
from twomartens.allrisscraper import data_types as types
XPATH_2ND_TD = "td[2]"
def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
for meeting_obj in meetings:
process_agenda(driver, meeting_obj)
def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> None:
driver.get(meeting_obj.link)
td = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]")
tables = td.find_elements_by_xpath("table")
meta_table = tables[0]
agenda_table = tables[1]
meta_trs = meta_table.find_elements_by_xpath("./tbody//tr//td[1]//tr")
meeting_obj.address = str(meta_trs[5].find_element_by_xpath(XPATH_2ND_TD).text)
agenda_item_trs = agenda_table.find_elements(
By.XPATH,
".//tr[not(descendant::th) and not(descendant::td[contains(@colspan, '7')])]")
agenda_item_trs = agenda_item_trs[:-1]
agenda_items = list()
for index, agenda_item_tr in enumerate(agenda_item_trs):
agenda_items.append(process_agenda_item(index, agenda_item_tr))
meeting_obj.agenda = types.Agenda(agenda_items)
def process_agenda_item(index: int, item: WebElement) -> types.AgendaItem:
tds = item.find_elements_by_xpath("td")
item_link = str(tds[0].find_element_by_tag_name("a").get_property("href")).strip()
number = str(tds[0].find_element_by_tag_name("a").text).strip()
name = str(tds[3].text).strip()
public = "Ö" in number
motion_td = str(tds[5].text).strip()
has_motion = len(motion_td) != 0
motion_link = None
motion_reference = None
if has_motion:
motion_link = str(tds[5].find_element_by_tag_name("a").get_property("href")).strip()
motion_reference = str(tds[5].find_element_by_tag_name("a").text).strip()
return types.AgendaItem(number=number, order=index, name=name,
public=public, link=item_link,
motion_link=motion_link, motion_reference=motion_reference,
resolution_text="")
def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, types.Motion]:
motions: Dict[str, types.Motion] = dict()
for _meeting in meetings:
agenda_items = _meeting.agenda.agenda_items
for agenda_item in agenda_items:
if agenda_item.motion_link is None:
continue
motions[agenda_item.motion_reference] = get_motion(driver=driver, agenda_item_link=agenda_item.link,
link=agenda_item.motion_link,
reference=agenda_item.motion_reference)
return motions
def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> types.Motion:
driver.get(link)
meta_table = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table")
meta_trs = meta_table.find_elements_by_xpath("./tbody//tr")
name = str(meta_trs[0].find_element_by_xpath(XPATH_2ND_TD).text).strip()
motion_type = str(meta_trs[1].find_element_by_xpath("td[4]").text).strip()
under_direction_of = str(meta_trs[2].find_element_by_xpath(XPATH_2ND_TD).text).strip()
consultation_trs = meta_trs[4].find_elements_by_xpath(".//table//tr")[1:]
current_organization: Optional[str] = None
current_role: Optional[str] = None
consultations = []
for consultation_tr in consultation_trs:
tds = consultation_tr.find_elements_by_xpath("td")
is_organization_header = tds[1].get_attribute("class") == "text1"
if is_organization_header:
current_organization = str(tds[1].text).strip()
if len(tds) >= 3:
current_role = str(tds[2].text).strip()
else:
current_role = None
else:
authoritative = str(tds[0].get_property("title")).strip() == "Erledigt" \
and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen", "abgelehnt"]
link_exists = len(tds[3].find_elements_by_xpath("a")) > 0
if not link_exists:
continue
meeting_link = str(tds[3].find_element_by_xpath("a").get_property("href")).strip()
consultations.append(types.Consultation(
authoritative=authoritative, meeting=meeting_link,
organization=[current_organization], role=current_role,
agenda_item=agenda_item_link, result=str(tds[2].text).strip()
))
file_table = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[3]//table")
motion_file_form = file_table.find_element_by_xpath(".//tr[2]//td//form[1]")
hidden_inputs = motion_file_form.find_elements_by_xpath(".//input[contains(@type, 'hidden')]")
file_link = ""
for hidden_input in hidden_inputs:
if file_link == "":
file_link += "?"
else:
file_link += "&"
file_link += f"{hidden_input.get_property('name')}={hidden_input.get_property('value')}"
file_link = f"{motion_file_form.get_property('action')}{file_link}"
text_divs = driver.find_elements_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//div")
context_div = text_divs[0]
context_ps = context_div.find_elements_by_xpath("p")[1:-1]
context = ""
for p in context_ps:
if len(context) > 0:
context += "\n"
context += str(p.text).strip()
petition_div = text_divs[1]
petition_ps = petition_div.find_elements_by_xpath("p")[1:-1]
petition = ""
for p in petition_ps:
if len(petition) > 0:
petition += "\n"
petition += str(p.text).strip()
petition.rstrip()
return types.Motion(name=name, reference=reference,
type=motion_type, under_direction_of=under_direction_of,
context=context, petition=petition, consultations=consultations,
file=file_link)