Fixed remaining issues with storing organization data

This commit is contained in:
2020-07-05 21:38:17 +02:00
parent c2aeb93b67
commit 7fded5f09d
8 changed files with 157 additions and 134 deletions

View File

@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from dataclasses import dataclass
from typing import Dict from typing import Dict
from typing import List from typing import List
from typing import Optional from typing import Optional
@ -24,46 +23,9 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.remote.webelement import WebElement
from twomartens.allrisscraper import meeting from twomartens.allrisscraper import meeting
from twomartens.allrisscraper.public import XPATH_2ND_TD from twomartens.allrisscraper import data_types as types
XPATH_2ND_TD = "td[2]"
@dataclass
class Consultation:
authoritative: bool
agenda_item: str
meeting: str
organization: List[str]
role: str
result: str
@dataclass
class Motion:
consultations: List[Consultation]
context: str
file: str
name: str
reference: str
petition: str
type: str
under_direction_of: str
@dataclass
class AgendaItem:
number: str
order: int
name: str
public: bool
link: str
motion_link: str
motion_reference: str
resolution_text: str
@dataclass
class Agenda:
agenda_items: List[AgendaItem]
def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None: def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
@ -88,10 +50,10 @@ def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> N
agenda_items = list() agenda_items = list()
for index, agenda_item_tr in enumerate(agenda_item_trs): for index, agenda_item_tr in enumerate(agenda_item_trs):
agenda_items.append(process_agenda_item(index, agenda_item_tr)) agenda_items.append(process_agenda_item(index, agenda_item_tr))
meeting_obj.agenda = Agenda(agenda_items) meeting_obj.agenda = types.Agenda(agenda_items)
def process_agenda_item(index: int, item: WebElement) -> AgendaItem: def process_agenda_item(index: int, item: WebElement) -> types.AgendaItem:
tds = item.find_elements_by_xpath("td") tds = item.find_elements_by_xpath("td")
item_link = str(tds[0].find_element_by_tag_name("a").get_property("href")).strip() item_link = str(tds[0].find_element_by_tag_name("a").get_property("href")).strip()
number = str(tds[0].find_element_by_tag_name("a").text).strip() number = str(tds[0].find_element_by_tag_name("a").text).strip()
@ -105,14 +67,14 @@ def process_agenda_item(index: int, item: WebElement) -> AgendaItem:
motion_link = str(tds[5].find_element_by_tag_name("a").get_property("href")).strip() motion_link = str(tds[5].find_element_by_tag_name("a").get_property("href")).strip()
motion_reference = str(tds[5].find_element_by_tag_name("a").text).strip() motion_reference = str(tds[5].find_element_by_tag_name("a").text).strip()
return AgendaItem(number=number, order=index, name=name, return types.AgendaItem(number=number, order=index, name=name,
public=public, link=item_link, public=public, link=item_link,
motion_link=motion_link, motion_reference=motion_reference, motion_link=motion_link, motion_reference=motion_reference,
resolution_text="") resolution_text="")
def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, Motion]: def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, types.Motion]:
motions: Dict[str, Motion] = dict() motions: Dict[str, types.Motion] = dict()
for _meeting in meetings: for _meeting in meetings:
agenda_items = _meeting.agenda.agenda_items agenda_items = _meeting.agenda.agenda_items
for agenda_item in agenda_items: for agenda_item in agenda_items:
@ -124,7 +86,7 @@ def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> D
return motions return motions
def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> Motion: def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> types.Motion:
driver.get(link) driver.get(link)
meta_table = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table") meta_table = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table")
meta_trs = meta_table.find_elements_by_xpath("./tbody//tr") meta_trs = meta_table.find_elements_by_xpath("./tbody//tr")
@ -140,12 +102,18 @@ def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, refe
is_organization_header = tds[1].get_attribute("class") == "text1" is_organization_header = tds[1].get_attribute("class") == "text1"
if is_organization_header: if is_organization_header:
current_organization = str(tds[1].text).strip() current_organization = str(tds[1].text).strip()
if len(tds) >= 3:
current_role = str(tds[2].text).strip() current_role = str(tds[2].text).strip()
else:
current_role = None
else: else:
authoritative = str(tds[0].get_property("title")).strip() == "Erledigt" \ authoritative = str(tds[0].get_property("title")).strip() == "Erledigt" \
and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen"] and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen", "abgelehnt"]
link_exists = len(tds[3].find_elements_by_xpath("a")) > 0
if not link_exists:
continue
meeting_link = str(tds[3].find_element_by_xpath("a").get_property("href")).strip() meeting_link = str(tds[3].find_element_by_xpath("a").get_property("href")).strip()
consultations.append(Consultation( consultations.append(types.Consultation(
authoritative=authoritative, meeting=meeting_link, authoritative=authoritative, meeting=meeting_link,
organization=[current_organization], role=current_role, organization=[current_organization], role=current_role,
agenda_item=agenda_item_link, result=str(tds[2].text).strip() agenda_item=agenda_item_link, result=str(tds[2].text).strip()
@ -181,7 +149,7 @@ def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, refe
petition += str(p.text).strip() petition += str(p.text).strip()
petition.rstrip() petition.rstrip()
return Motion(name=name, reference=reference, return types.Motion(name=name, reference=reference,
type=motion_type, under_direction_of=under_direction_of, type=motion_type, under_direction_of=under_direction_of,
context=context, petition=petition, consultations=consultations, context=context, petition=petition, consultations=consultations,
file=file_link) file=file_link)

View File

@ -0,0 +1,65 @@
from dataclasses import dataclass
from typing import List
from typing import Optional
@dataclass
class Consultation:
    """A single consultation of a motion, as collected by the motion scraper.

    Field order is part of the constructor interface and must not change.
    """
    # True when the row's status title is "Erledigt" and its outcome text is
    # one of the decisive results checked by the scraper.
    authoritative: bool
    # Link of the agenda item through which the motion was reached.
    agenda_item: str
    # Link (href) of the meeting in which the consultation took place.
    meeting: str
    # Organization(s) holding the consultation; the scraper stores a
    # single-element list with the current organization header text.
    organization: List[str]
    # Role text from the organization header row; None when that header row
    # has no third cell, hence Optional rather than plain str.
    role: Optional[str]
    # Stripped text of the third cell of the consultation row.
    result: str
@dataclass
class Motion:
    """A motion (with its consultations) as assembled by the motion scraper.

    Field order is part of the constructor interface and must not change.
    """
    # Consultations collected from the motion page, in page order.
    consultations: List[Consultation]
    # Context text of the motion (presumably the "Sachverhalt" section -
    # TODO confirm against the scraper).
    context: str
    # Link to the motion's file; filled from the scraper's file link.
    file: str
    # Name/title of the motion.
    name: str
    # Reference passed in by the caller for this motion.
    reference: str
    # Petition text, built by concatenating the stripped paragraph texts.
    petition: str
    # Motion type as scraped; note the field name shadows the builtin but is
    # part of the public interface.
    type: str
    # Who is in charge of the motion (as scraped from the meta table -
    # presumably; verify against the scraper).
    under_direction_of: str
@dataclass
class AgendaItem:
    """One row of a meeting's agenda table.

    Field order is part of the constructor interface and must not change.
    """
    # Text of the link in the row's first cell (the item number as shown).
    number: str
    # Zero-based position of the row within the scraped agenda rows.
    order: int
    # Title of the agenda item.
    name: str
    # Whether the item is public.
    public: bool
    # href of the link in the row's first cell.
    link: str
    # href of the motion link in the row's sixth cell.
    motion_link: str
    # Text of the motion link in the row's sixth cell.
    motion_reference: str
    # Resolution text; the agenda scraper initialises it to "".
    resolution_text: str
@dataclass
class Agenda:
    """The agenda of a single meeting."""
    # Agenda items in the order the scraper appended them.
    agenda_items: List[AgendaItem]
@dataclass
class Membership:
    """Membership of one person in one organization.

    Field order is part of the constructor interface and must not change.
    """
    # Link (href) to the person's page; "" when the member row has no link.
    person: str
    # Name of the organization the membership belongs to.
    organization: str
    # Stripped text of the member row's fourth cell.
    role: str
    # Stripped text of the member row's fifth cell.
    on_behalf_of: str
@dataclass
class Organization:
    """An organization together with its memberships.

    Field order is part of the constructor interface and must not change.
    """
    # Classification supplied by the caller (e.g. "Bezirksversammlung").
    classification: str
    # One Membership per scraped member row.
    membership: List[Membership]
    # Stripped text of the page's name element.
    name: str
    # Organization type supplied by the caller.
    organization_type: str
@dataclass
class Person:
    """Contact data of a person scraped from the person's page.

    Field order is part of the constructor interface and must not change.
    """
    # Name as shown in the second row of the meta table.
    name: str
    # Form of address taken from the first row of the meta table.
    form_of_address: str
    # Phone entries; one per row whose icon alt text marks a phone row.
    phone: List[str]
    # E-mail entries; one per row whose icon alt text marks an e-mail row.
    email: List[str]

View File

@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse
import configparser import configparser
import os import os
from datetime import date from datetime import date
@ -33,7 +32,7 @@ from twomartens.allrisscraper import meeting
from twomartens.allrisscraper.definitions import ALLRIS_LOGIN from twomartens.allrisscraper.definitions import ALLRIS_LOGIN
def main(args: argparse.Namespace) -> None: def main(_) -> None:
config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
if not config_module.initialize_config(config_file): if not config_module.initialize_config(config_file):
return return
@ -207,7 +206,3 @@ def get_abbreviation(name):
else: else:
abbreviated_name = f"{abbreviated_name}{part[:1].capitalize()}" abbreviated_name = f"{abbreviated_name}{part[:1].capitalize()}"
return abbreviated_name return abbreviated_name
if __name__ == "__main__":
main()

View File

@ -9,12 +9,13 @@ def main():
subparsers = parser.add_subparsers(help="sub-command help", required=True) subparsers = parser.add_subparsers(help="sub-command help", required=True)
oparl_parser = subparsers.add_parser("oparl", help="scrapes the public website") oparl_parser = subparsers.add_parser("oparl", help="scrapes the public website")
oparl_parser.add_argument("--include-organizations", action="store_true", dest="include_organizations") oparl_parser.add_argument("--include-organizations", action="store_true", dest="include_organizations")
oparl_parser.add_argument("--include-meetings", action="store_true", dest="include_meetings")
oparl_parser.set_defaults(function=public.main) oparl_parser.set_defaults(function=public.main)
internal_parser = subparsers.add_parser("internal", help="scrapes the internal website") internal_parser = subparsers.add_parser("internal", help="scrapes the internal website")
internal_parser.set_defaults(function=internal.main) internal_parser.set_defaults(function=internal.main)
args = parser.parse_args() args = parser.parse_args()
args.func(args) args.function(args)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -23,7 +23,7 @@ from selenium import webdriver
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.webelement import FirefoxWebElement from selenium.webdriver.firefox.webelement import FirefoxWebElement
from twomartens.allrisscraper.agenda import Agenda from twomartens.allrisscraper import data_types
from twomartens.allrisscraper.definitions import MONTHS from twomartens.allrisscraper.definitions import MONTHS
@ -36,7 +36,7 @@ class Meeting:
link: str link: str
location: str location: str
address: Optional[str] address: Optional[str]
agenda: Optional[Agenda] agenda: Optional[data_types.Agenda]
def get_meetings(driver: webdriver, base_url: str): def get_meetings(driver: webdriver, base_url: str):

View File

@ -1,27 +1,12 @@
from dataclasses import dataclass
from typing import List from typing import List
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement from selenium.webdriver.remote.webelement import WebElement
from twomartens.allrisscraper import data_types as types
@dataclass
class Membership:
person: str
organization: str
role: str
on_behalf_of: str
@dataclass def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[types.Organization]:
class Organization:
classification: str
membership: List[Membership]
name: str
organization_type: str
def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[Organization]:
organizations = [get_organization(driver=driver, organizations = [get_organization(driver=driver,
link=f"{base_url}/pa021.asp", link=f"{base_url}/pa021.asp",
classification="Bezirksversammlung", classification="Bezirksversammlung",
@ -34,7 +19,7 @@ def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[Organiza
return organizations return organizations
def get_committees(driver: webdriver.Firefox, link: str) -> List[Organization]: def get_committees(driver: webdriver.Firefox, link: str) -> List[types.Organization]:
driver.get(link) driver.get(link)
committee_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr[not(contains(@class, 'zw1'))]")[2:-1] committee_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr[not(contains(@class, 'zw1'))]")[2:-1]
organizations = [] organizations = []
@ -52,7 +37,7 @@ def get_committees(driver: webdriver.Firefox, link: str) -> List[Organization]:
return organizations return organizations
def get_factions(driver: webdriver.Firefox, link: str) -> List[Organization]: def get_factions(driver: webdriver.Firefox, link: str) -> List[types.Organization]:
driver.get(link) driver.get(link)
driver.get(link) driver.get(link)
faction_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1] faction_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1]
@ -71,22 +56,26 @@ def get_factions(driver: webdriver.Firefox, link: str) -> List[Organization]:
return organizations return organizations
def get_organization(driver: webdriver.Firefox, link: str, classification: str, organization_type: str) -> Organization: def get_organization(driver: webdriver.Firefox, link: str, classification: str,
organization_type: str) -> types.Organization:
driver.get(link) driver.get(link)
name = str(driver.find_element_by_xpath("//div[@id='risname']").text) name = str(driver.find_element_by_xpath("//div[@id='risname']").text).strip()
memberships = [] memberships = []
member_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1] member_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table[2]//tr")[2:-1]
for member_tr in member_trs: for member_tr in member_trs:
memberships.append(get_membership(member_tr, name)) memberships.append(get_membership(member_tr, name))
return Organization(name=name, classification=classification, return types.Organization(name=name, classification=classification,
organization_type=organization_type, membership=memberships) organization_type=organization_type, membership=memberships)
def get_membership(member_tr: WebElement, organization: str) -> Membership: def get_membership(member_tr: WebElement, organization: str) -> types.Membership:
tds = member_tr.find_elements_by_xpath("td") tds = member_tr.find_elements_by_xpath("td")
if len(tds[2].find_elements_by_xpath("a")) == 0:
person_link = ""
else:
person_link = str(tds[2].find_element_by_xpath("a").get_property("href")).strip() person_link = str(tds[2].find_element_by_xpath("a").get_property("href")).strip()
role = str(tds[3].text).strip() role = str(tds[3].text).strip()
on_behalf_of = str(tds[4].text).strip() on_behalf_of = str(tds[4].text).strip()
return Membership(person=person_link, organization=organization, role=role, on_behalf_of=on_behalf_of) return types.Membership(person=person_link, organization=organization, role=role, on_behalf_of=on_behalf_of)

View File

@ -1,45 +1,53 @@
from dataclasses import dataclass
from typing import Dict from typing import Dict
from typing import List from typing import List
from selenium import webdriver from selenium import webdriver
from twomartens.allrisscraper.organization import Organization from twomartens.allrisscraper import data_types as types
@dataclass def get_persons(driver: webdriver.Firefox, organizations: List[types.Organization]) -> List[types.Person]:
class Person: persons: Dict[str, types.Person] = {}
name: str
form_of_address: str
phone: List[str]
email: List[str]
def get_persons(driver: webdriver.Firefox, organizations: List[Organization]) -> List[Person]:
persons: Dict[str, Person] = {}
for org in organizations: for org in organizations:
memberships = org.membership memberships = org.membership
for membership in memberships: for membership in memberships:
person_link = membership.person person_link = membership.person
if person_link in persons: if person_link in persons:
continue continue
if person_link == "":
continue
persons[person_link] = get_person(driver=driver, link=person_link) persons[person_link] = get_person(driver=driver, link=person_link)
return list(persons.values()) return list(persons.values())
def get_person(driver: webdriver.Firefox, link: str) -> Person: def get_person(driver: webdriver.Firefox, link: str) -> types.Person:
driver.get(link) driver.get(link)
meta_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr//td//table//tr") meta_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table[1]//tr//td//table//tr")
form_of_address = str(meta_trs[0].find_element_by_xpath("td[3]").text).strip() form_of_address = str(meta_trs[0].find_element_by_xpath("td[3]").text).strip()
name = str(meta_trs[1].find_element_by_xpath("td").text).strip() name = str(meta_trs[1].find_element_by_xpath("td").text).strip()
phone_tds = meta_trs[5].find_elements_by_xpath("td") phone = []
phone = "" email = []
if len(phone_tds) > 1: additional_trs = meta_trs[2:]
phone = str(meta_trs[5].find_element_by_xpath("td[2]//span").text).strip() for tr in additional_trs:
email_tds = meta_trs[6].find_elements_by_xpath("td") tds = tr.find_elements_by_tag_name("td")
email = "" if len(tds) == 1 and str(tds[0].text).strip() == "":
if len(email_tds) > 1: continue
email = str(meta_trs[6].find_element_by_xpath("td[2]//a").text).strip() images = tr.find_elements_by_xpath("td[1]//img")
if len(images) == 0:
continue
alt = images[0].get_property("alt")
if is_email_row(alt):
email.append(str(tr.find_element_by_xpath("td[2]//a").text).strip())
if is_phone_row(alt):
phone.append(str(tr.find_element_by_xpath("td[2]//span").text).strip())
return Person(name=name, form_of_address=form_of_address, phone=[phone], email=[email]) return types.Person(name=name, form_of_address=form_of_address, phone=phone, email=email)
def is_email_row(alt: str) -> bool:
    """Tell whether an icon's alt text marks an e-mail row.

    :param alt: alt text of the icon in the row's first cell. The value is
        read from a browser property and may be missing (``None``) or empty
        at runtime; both are treated as "not an e-mail row" instead of
        raising ``TypeError`` on the containment check.
    :return: ``True`` if ``alt`` contains ``"eMail"``, otherwise ``False``.
    """
    if not alt:
        return False
    return "eMail" in alt
def is_phone_row(alt: str) -> bool:
    """Tell whether an icon's alt text marks a phone row.

    :param alt: alt text of the icon in the row's first cell. The value is
        read from a browser property and may be missing (``None``) or empty
        at runtime; both are treated as "not a phone row" instead of raising
        ``TypeError`` on the containment check.
    :return: ``True`` if ``alt`` contains ``"Tel"``, otherwise ``False``.
    """
    if not alt:
        return False
    return "Tel" in alt

View File

@ -15,8 +15,6 @@ from twomartens.allrisscraper import meeting
from twomartens.allrisscraper import organization from twomartens.allrisscraper import organization
from twomartens.allrisscraper import person from twomartens.allrisscraper import person
XPATH_2ND_TD = "td[2]"
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini" config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
@ -35,27 +33,26 @@ def main(args: argparse.Namespace):
binary = FirefoxBinary(firefox_binary) binary = FirefoxBinary(firefox_binary)
driver = webdriver.Firefox(firefox_binary=binary, options=options) driver = webdriver.Firefox(firefox_binary=binary, options=options)
driver.implicitly_wait(2) driver.implicitly_wait(2)
os.makedirs(json_path, exist_ok=True)
if args.include_meetings:
meetings = meeting.get_meetings(driver, base_url) meetings = meeting.get_meetings(driver, base_url)
agenda.process_agendas(driver, meetings) agenda.process_agendas(driver, meetings)
motions = agenda.get_motions(driver, meetings) motions = agenda.get_motions(driver, meetings)
organizations = []
persons = []
if args.include_organizations:
organizations = organization.get_organizations(driver, base_url)
persons = person.get_persons(driver, organizations)
driver.close()
os.makedirs(json_path, exist_ok=True)
with open(json_path + "meetings.json", "w") as file: with open(json_path + "meetings.json", "w") as file:
json.dump(meetings, file, json.dump(meetings, file,
cls=custom_json.EnhancedJSONEncoder) cls=custom_json.EnhancedJSONEncoder)
with open(json_path + "motions.json", "w") as file: with open(json_path + "motions.json", "w") as file:
json.dump(motions, file, json.dump(motions, file,
cls=custom_json.EnhancedJSONEncoder) cls=custom_json.EnhancedJSONEncoder)
if args.include_organizations: if args.include_organizations:
organizations = organization.get_organizations(driver, base_url)
persons = person.get_persons(driver, organizations)
with open(json_path + "organizations.json", "w") as file: with open(json_path + "organizations.json", "w") as file:
json.dump(organizations, file, json.dump(organizations, file,
cls=custom_json.EnhancedJSONEncoder) cls=custom_json.EnhancedJSONEncoder)
with open(json_path + "persons.json", "w") as file: with open(json_path + "persons.json", "w") as file:
json.dump(persons, file, json.dump(persons, file,
cls=custom_json.EnhancedJSONEncoder) cls=custom_json.EnhancedJSONEncoder)
driver.close()