From 7fded5f09dd0c261f86ba34117378a14e494827a Mon Sep 17 00:00:00 2001 From: Jim Martens Date: Sun, 5 Jul 2020 21:38:17 +0200 Subject: [PATCH] Fixed remaining issues with storing organization data --- src/twomartens/allrisscraper/agenda.py | 80 ++++++-------------- src/twomartens/allrisscraper/data_types.py | 65 ++++++++++++++++ src/twomartens/allrisscraper/internal.py | 7 +- src/twomartens/allrisscraper/main.py | 3 +- src/twomartens/allrisscraper/meeting.py | 4 +- src/twomartens/allrisscraper/organization.py | 45 +++++------ src/twomartens/allrisscraper/person.py | 56 ++++++++------ src/twomartens/allrisscraper/public.py | 31 ++++---- 8 files changed, 157 insertions(+), 134 deletions(-) create mode 100644 src/twomartens/allrisscraper/data_types.py diff --git a/src/twomartens/allrisscraper/agenda.py b/src/twomartens/allrisscraper/agenda.py index 41c317a..1a52caf 100644 --- a/src/twomartens/allrisscraper/agenda.py +++ b/src/twomartens/allrisscraper/agenda.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass from typing import Dict from typing import List from typing import Optional @@ -24,46 +23,9 @@ from selenium.webdriver.common.by import By from selenium.webdriver.remote.webelement import WebElement from twomartens.allrisscraper import meeting -from twomartens.allrisscraper.public import XPATH_2ND_TD +from twomartens.allrisscraper import data_types as types - -@dataclass -class Consultation: - authoritative: bool - agenda_item: str - meeting: str - organization: List[str] - role: str - result: str - - -@dataclass -class Motion: - consultations: List[Consultation] - context: str - file: str - name: str - reference: str - petition: str - type: str - under_direction_of: str - - -@dataclass -class AgendaItem: - number: str - order: int - name: str - public: bool - link: str - motion_link: str - motion_reference: str - resolution_text: str - - -@dataclass -class Agenda: - agenda_items: List[AgendaItem] +XPATH_2ND_TD = "td[2]" def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None: @@ -88,10 +50,10 @@ def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> N agenda_items = list() for index, agenda_item_tr in enumerate(agenda_item_trs): agenda_items.append(process_agenda_item(index, agenda_item_tr)) - meeting_obj.agenda = Agenda(agenda_items) + meeting_obj.agenda = types.Agenda(agenda_items) -def process_agenda_item(index: int, item: WebElement) -> AgendaItem: +def process_agenda_item(index: int, item: WebElement) -> types.AgendaItem: tds = item.find_elements_by_xpath("td") item_link = str(tds[0].find_element_by_tag_name("a").get_property("href")).strip() number = str(tds[0].find_element_by_tag_name("a").text).strip() @@ -105,14 +67,14 @@ def process_agenda_item(index: int, item: WebElement) -> AgendaItem: motion_link = str(tds[5].find_element_by_tag_name("a").get_property("href")).strip() motion_reference = str(tds[5].find_element_by_tag_name("a").text).strip() - return AgendaItem(number=number, order=index, name=name, - public=public, link=item_link, - motion_link=motion_link, motion_reference=motion_reference, - resolution_text="") + return types.AgendaItem(number=number, order=index, name=name, + public=public, link=item_link, + motion_link=motion_link, motion_reference=motion_reference, + resolution_text="") -def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, Motion]: - motions: Dict[str, Motion] = dict() +def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, types.Motion]: + motions: Dict[str, types.Motion] = dict() for _meeting in meetings: agenda_items = _meeting.agenda.agenda_items for agenda_item in agenda_items: @@ -124,7 +86,7 @@ def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> D return motions -def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> Motion: +def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> types.Motion: driver.get(link) meta_table = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table") meta_trs = meta_table.find_elements_by_xpath("./tbody//tr") @@ -140,12 +102,18 @@ def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, refe is_organization_header = tds[1].get_attribute("class") == "text1" if is_organization_header: current_organization = str(tds[1].text).strip() - current_role = str(tds[2].text).strip() + if len(tds) >= 3: + current_role = str(tds[2].text).strip() + else: + current_role = None else: authoritative = str(tds[0].get_property("title")).strip() == "Erledigt" \ - and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen"] + and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen", "abgelehnt"] + link_exists = len(tds[3].find_elements_by_xpath("a")) > 0 + if not link_exists: + continue meeting_link = str(tds[3].find_element_by_xpath("a").get_property("href")).strip() - consultations.append(Consultation( + consultations.append(types.Consultation( authoritative=authoritative, meeting=meeting_link, organization=[current_organization], role=current_role, agenda_item=agenda_item_link, result=str(tds[2].text).strip() @@ -181,7 +149,7 @@ def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, refe petition += str(p.text).strip() petition.rstrip() - return Motion(name=name, reference=reference, - type=motion_type, under_direction_of=under_direction_of, - context=context, petition=petition, consultations=consultations, - file=file_link) + return types.Motion(name=name, reference=reference, + type=motion_type, under_direction_of=under_direction_of, + context=context, petition=petition, consultations=consultations, + file=file_link) diff --git a/src/twomartens/allrisscraper/data_types.py b/src/twomartens/allrisscraper/data_types.py new file mode 100644 index 0000000..471c4f8 --- /dev/null +++ b/src/twomartens/allrisscraper/data_types.py @@ -0,0 +1,65 @@ +from dataclasses import dataclass +from typing import List + + +@dataclass +class Consultation: + authoritative: bool + agenda_item: str + meeting: str + organization: List[str] + role: str + result: str + + +@dataclass +class Motion: + consultations: List[Consultation] + context: str + file: str + name: str + reference: str + petition: str + type: str + under_direction_of: str + + +@dataclass +class AgendaItem: + number: str + order: int + name: str + public: bool + link: str + motion_link: str + motion_reference: str + resolution_text: str + + +@dataclass +class Agenda: + agenda_items: List[AgendaItem] + + +@dataclass +class Membership: + person: str + organization: str + role: str + on_behalf_of: str + + +@dataclass +class Organization: + classification: str + membership: List[Membership] + name: str + organization_type: str + + +@dataclass +class Person: + name: str + form_of_address: str + phone: List[str] + email: List[str] diff --git a/src/twomartens/allrisscraper/internal.py b/src/twomartens/allrisscraper/internal.py index 8633b7f..d2b5d8f 100644 --- a/src/twomartens/allrisscraper/internal.py +++ b/src/twomartens/allrisscraper/internal.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse import configparser import os from datetime import date @@ -33,7 +32,7 @@ from twomartens.allrisscraper import meeting from twomartens.allrisscraper.definitions import ALLRIS_LOGIN -def main(args: argparse.Namespace) -> None: +def main(_) -> None: config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" if not config_module.initialize_config(config_file): return @@ -207,7 +206,3 @@ def get_abbreviation(name): else: abbreviated_name = f"{abbreviated_name}{part[:1].capitalize()}" return abbreviated_name - - -if __name__ == "__main__": - main() diff --git a/src/twomartens/allrisscraper/main.py b/src/twomartens/allrisscraper/main.py index d487cba..1ca53b9 100644 --- a/src/twomartens/allrisscraper/main.py +++ b/src/twomartens/allrisscraper/main.py @@ -9,12 +9,13 @@ def main(): subparsers = parser.add_subparsers(help="sub-command help", required=True) oparl_parser = subparsers.add_parser("oparl", help="scrapes the public website") oparl_parser.add_argument("--include-organizations", action="store_true", dest="include_organizations") + oparl_parser.add_argument("--include-meetings", action="store_true", dest="include_meetings") oparl_parser.set_defaults(function=public.main) internal_parser = subparsers.add_parser("internal", help="scrapes the internal website") internal_parser.set_defaults(function=internal.main) args = parser.parse_args() - args.func(args) + args.function(args) if __name__ == "__main__": diff --git a/src/twomartens/allrisscraper/meeting.py b/src/twomartens/allrisscraper/meeting.py index 3229e52..4fda237 100644 --- a/src/twomartens/allrisscraper/meeting.py +++ b/src/twomartens/allrisscraper/meeting.py @@ -23,7 +23,7 @@ from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.firefox.webelement import FirefoxWebElement -from twomartens.allrisscraper.agenda import Agenda +from twomartens.allrisscraper import data_types from twomartens.allrisscraper.definitions import MONTHS @@ -36,7 +36,7 @@ class Meeting: link: str location: str address: Optional[str] - agenda: Optional[Agenda] + agenda: Optional[data_types.Agenda] def get_meetings(driver: webdriver, base_url: str): diff --git a/src/twomartens/allrisscraper/organization.py b/src/twomartens/allrisscraper/organization.py index b1f3f2d..77090b0 100644 --- a/src/twomartens/allrisscraper/organization.py +++ b/src/twomartens/allrisscraper/organization.py @@ -1,27 +1,12 @@ -from dataclasses import dataclass from typing import List from selenium import webdriver from selenium.webdriver.remote.webelement import WebElement - -@dataclass -class Membership: - person: str - organization: str - role: str - on_behalf_of: str +from twomartens.allrisscraper import data_types as types -@dataclass -class Organization: - classification: str - membership: List[Membership] - name: str - organization_type: str - - -def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[Organization]: +def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[types.Organization]: organizations = [get_organization(driver=driver, link=f"{base_url}/pa021.asp", classification="Bezirksversammlung", @@ -34,7 +19,7 @@ def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[Organiza return organizations -def get_committees(driver: webdriver.Firefox, link: str) -> List[Organization]: +def get_committees(driver: webdriver.Firefox, link: str) -> List[types.Organization]: driver.get(link) committee_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr[not(contains(@class, 'zw1'))]")[2:-1] organizations = [] @@ -48,11 +33,11 @@ def get_committees(driver: webdriver.Firefox, link: str) -> List[Organization]: for link in links: organizations.append(get_organization(driver=driver, link=link, classification="Ausschuss", organization_type="Gremium")) - + return organizations -def get_factions(driver: webdriver.Firefox, link: str) -> List[Organization]: +def get_factions(driver: webdriver.Firefox, link: str) -> List[types.Organization]: driver.get(link) driver.get(link) faction_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1] @@ -71,22 +56,26 @@ def get_factions(driver: webdriver.Firefox, link: str) -> List[Organization]: return organizations -def get_organization(driver: webdriver.Firefox, link: str, classification: str, organization_type: str) -> Organization: +def get_organization(driver: webdriver.Firefox, link: str, classification: str, + organization_type: str) -> types.Organization: driver.get(link) - name = str(driver.find_element_by_xpath("//div[@id='risname']").text) + name = str(driver.find_element_by_xpath("//div[@id='risname']").text).strip() memberships = [] - member_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1] + member_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table[2]//tr")[2:-1] for member_tr in member_trs: memberships.append(get_membership(member_tr, name)) - return Organization(name=name, classification=classification, - organization_type=organization_type, membership=memberships) + return types.Organization(name=name, classification=classification, + organization_type=organization_type, membership=memberships) -def get_membership(member_tr: WebElement, organization: str) -> Membership: +def get_membership(member_tr: WebElement, organization: str) -> types.Membership: tds = member_tr.find_elements_by_xpath("td") - person_link = str(tds[2].find_element_by_xpath("a").get_property("href")).strip() + if len(tds[2].find_elements_by_xpath("a")) == 0: + person_link = "" + else: + person_link = str(tds[2].find_element_by_xpath("a").get_property("href")).strip() role = str(tds[3].text).strip() on_behalf_of = str(tds[4].text).strip() - return Membership(person=person_link, organization=organization, role=role, on_behalf_of=on_behalf_of) + return types.Membership(person=person_link, organization=organization, role=role, on_behalf_of=on_behalf_of) diff --git a/src/twomartens/allrisscraper/person.py b/src/twomartens/allrisscraper/person.py index 0b446c6..3fd86ea 100644 --- a/src/twomartens/allrisscraper/person.py +++ b/src/twomartens/allrisscraper/person.py @@ -1,45 +1,53 @@ -from dataclasses import dataclass from typing import Dict from typing import List from selenium import webdriver -from twomartens.allrisscraper.organization import Organization +from twomartens.allrisscraper import data_types as types -@dataclass -class Person: - name: str - form_of_address: str - phone: List[str] - email: List[str] - - -def get_persons(driver: webdriver.Firefox, organizations: List[Organization]) -> List[Person]: - persons: Dict[str, Person] = {} +def get_persons(driver: webdriver.Firefox, organizations: List[types.Organization]) -> List[types.Person]: + persons: Dict[str, types.Person] = {} for org in organizations: memberships = org.membership for membership in memberships: person_link = membership.person if person_link in persons: continue + if person_link == "": + continue persons[person_link] = get_person(driver=driver, link=person_link) - + return list(persons.values()) -def get_person(driver: webdriver.Firefox, link: str) -> Person: +def get_person(driver: webdriver.Firefox, link: str) -> types.Person: driver.get(link) - meta_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr//td//table//tr") + meta_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table[1]//tr//td//table//tr") form_of_address = str(meta_trs[0].find_element_by_xpath("td[3]").text).strip() name = str(meta_trs[1].find_element_by_xpath("td").text).strip() - phone_tds = meta_trs[5].find_elements_by_xpath("td") - phone = "" - if len(phone_tds) > 1: - phone = str(meta_trs[5].find_element_by_xpath("td[2]//span").text).strip() - email_tds = meta_trs[6].find_elements_by_xpath("td") - email = "" - if len(email_tds) > 1: - email = str(meta_trs[6].find_element_by_xpath("td[2]//a").text).strip() + phone = [] + email = [] + additional_trs = meta_trs[2:] + for tr in additional_trs: + tds = tr.find_elements_by_tag_name("td") + if len(tds) == 1 and str(tds[0].text).strip() == "": + continue + images = tr.find_elements_by_xpath("td[1]//img") + if len(images) == 0: + continue + alt = images[0].get_property("alt") + if is_email_row(alt): + email.append(str(tr.find_element_by_xpath("td[2]//a").text).strip()) + if is_phone_row(alt): + phone.append(str(tr.find_element_by_xpath("td[2]//span").text).strip()) - return Person(name=name, form_of_address=form_of_address, phone=[phone], email=[email]) + return types.Person(name=name, form_of_address=form_of_address, phone=phone, email=email) + + +def is_email_row(alt: str) -> bool: + return "eMail" in alt + + +def is_phone_row(alt: str) -> bool: + return "Tel" in alt diff --git a/src/twomartens/allrisscraper/public.py b/src/twomartens/allrisscraper/public.py index 4b608e5..8c9a93d 100644 --- a/src/twomartens/allrisscraper/public.py +++ b/src/twomartens/allrisscraper/public.py @@ -15,8 +15,6 @@ from twomartens.allrisscraper import meeting from twomartens.allrisscraper import organization from twomartens.allrisscraper import person -XPATH_2ND_TD = "td[2]" - def main(args: argparse.Namespace): config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" @@ -35,27 +33,26 @@ def main(args: argparse.Namespace): binary = FirefoxBinary(firefox_binary) driver = webdriver.Firefox(firefox_binary=binary, options=options) driver.implicitly_wait(2) - meetings = meeting.get_meetings(driver, base_url) - agenda.process_agendas(driver, meetings) - motions = agenda.get_motions(driver, meetings) - organizations = [] - persons = [] + os.makedirs(json_path, exist_ok=True) + if args.include_meetings: + meetings = meeting.get_meetings(driver, base_url) + agenda.process_agendas(driver, meetings) + motions = agenda.get_motions(driver, meetings) + with open(json_path + "meetings.json", "w") as file: + json.dump(meetings, file, + cls=custom_json.EnhancedJSONEncoder) + with open(json_path + "motions.json", "w") as file: + json.dump(motions, file, + cls=custom_json.EnhancedJSONEncoder) + if args.include_organizations: organizations = organization.get_organizations(driver, base_url) persons = person.get_persons(driver, organizations) - driver.close() - - os.makedirs(json_path, exist_ok=True) - with open(json_path + "meetings.json", "w") as file: - json.dump(meetings, file, - cls=custom_json.EnhancedJSONEncoder) - with open(json_path + "motions.json", "w") as file: - json.dump(motions, file, - cls=custom_json.EnhancedJSONEncoder) - if args.include_organizations: with open(json_path + "organizations.json", "w") as file: json.dump(organizations, file, cls=custom_json.EnhancedJSONEncoder) with open(json_path + "persons.json", "w") as file: json.dump(persons, file, cls=custom_json.EnhancedJSONEncoder) + + driver.close()