Fixed remaining issues with storing organization data
Cette révision appartient à :
Parent
c2aeb93b67
révision
7fded5f09d
|
@ -14,7 +14,6 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
|
@ -24,46 +23,9 @@ from selenium.webdriver.common.by import By
|
|||
from selenium.webdriver.remote.webelement import WebElement
|
||||
|
||||
from twomartens.allrisscraper import meeting
|
||||
from twomartens.allrisscraper.public import XPATH_2ND_TD
|
||||
from twomartens.allrisscraper import data_types as types
|
||||
|
||||
|
||||
@dataclass
|
||||
class Consultation:
|
||||
authoritative: bool
|
||||
agenda_item: str
|
||||
meeting: str
|
||||
organization: List[str]
|
||||
role: str
|
||||
result: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Motion:
|
||||
consultations: List[Consultation]
|
||||
context: str
|
||||
file: str
|
||||
name: str
|
||||
reference: str
|
||||
petition: str
|
||||
type: str
|
||||
under_direction_of: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AgendaItem:
|
||||
number: str
|
||||
order: int
|
||||
name: str
|
||||
public: bool
|
||||
link: str
|
||||
motion_link: str
|
||||
motion_reference: str
|
||||
resolution_text: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class Agenda:
|
||||
agenda_items: List[AgendaItem]
|
||||
XPATH_2ND_TD = "td[2]"
|
||||
|
||||
|
||||
def process_agendas(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> None:
|
||||
|
@ -88,10 +50,10 @@ def process_agenda(driver: webdriver.Firefox, meeting_obj: meeting.Meeting) -> N
|
|||
agenda_items = list()
|
||||
for index, agenda_item_tr in enumerate(agenda_item_trs):
|
||||
agenda_items.append(process_agenda_item(index, agenda_item_tr))
|
||||
meeting_obj.agenda = Agenda(agenda_items)
|
||||
meeting_obj.agenda = types.Agenda(agenda_items)
|
||||
|
||||
|
||||
def process_agenda_item(index: int, item: WebElement) -> AgendaItem:
|
||||
def process_agenda_item(index: int, item: WebElement) -> types.AgendaItem:
|
||||
tds = item.find_elements_by_xpath("td")
|
||||
item_link = str(tds[0].find_element_by_tag_name("a").get_property("href")).strip()
|
||||
number = str(tds[0].find_element_by_tag_name("a").text).strip()
|
||||
|
@ -105,14 +67,14 @@ def process_agenda_item(index: int, item: WebElement) -> AgendaItem:
|
|||
motion_link = str(tds[5].find_element_by_tag_name("a").get_property("href")).strip()
|
||||
motion_reference = str(tds[5].find_element_by_tag_name("a").text).strip()
|
||||
|
||||
return AgendaItem(number=number, order=index, name=name,
|
||||
public=public, link=item_link,
|
||||
motion_link=motion_link, motion_reference=motion_reference,
|
||||
resolution_text="")
|
||||
return types.AgendaItem(number=number, order=index, name=name,
|
||||
public=public, link=item_link,
|
||||
motion_link=motion_link, motion_reference=motion_reference,
|
||||
resolution_text="")
|
||||
|
||||
|
||||
def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, Motion]:
|
||||
motions: Dict[str, Motion] = dict()
|
||||
def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> Dict[str, types.Motion]:
|
||||
motions: Dict[str, types.Motion] = dict()
|
||||
for _meeting in meetings:
|
||||
agenda_items = _meeting.agenda.agenda_items
|
||||
for agenda_item in agenda_items:
|
||||
|
@ -124,7 +86,7 @@ def get_motions(driver: webdriver.Firefox, meetings: List[meeting.Meeting]) -> D
|
|||
return motions
|
||||
|
||||
|
||||
def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> Motion:
|
||||
def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, reference: str) -> types.Motion:
|
||||
driver.get(link)
|
||||
meta_table = driver.find_element_by_xpath("//table[@class='risdeco']//tr[2]//td[2]//table//tr//td[1]//table")
|
||||
meta_trs = meta_table.find_elements_by_xpath("./tbody//tr")
|
||||
|
@ -140,12 +102,18 @@ def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, refe
|
|||
is_organization_header = tds[1].get_attribute("class") == "text1"
|
||||
if is_organization_header:
|
||||
current_organization = str(tds[1].text).strip()
|
||||
current_role = str(tds[2].text).strip()
|
||||
if len(tds) >= 3:
|
||||
current_role = str(tds[2].text).strip()
|
||||
else:
|
||||
current_role = None
|
||||
else:
|
||||
authoritative = str(tds[0].get_property("title")).strip() == "Erledigt" \
|
||||
and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen"]
|
||||
and str(tds[4].text).strip() in ["beschlossen", "zur Kenntnis genommen", "abgelehnt"]
|
||||
link_exists = len(tds[3].find_elements_by_xpath("a")) > 0
|
||||
if not link_exists:
|
||||
continue
|
||||
meeting_link = str(tds[3].find_element_by_xpath("a").get_property("href")).strip()
|
||||
consultations.append(Consultation(
|
||||
consultations.append(types.Consultation(
|
||||
authoritative=authoritative, meeting=meeting_link,
|
||||
organization=[current_organization], role=current_role,
|
||||
agenda_item=agenda_item_link, result=str(tds[2].text).strip()
|
||||
|
@ -181,7 +149,7 @@ def get_motion(driver: webdriver.Firefox, agenda_item_link: str, link: str, refe
|
|||
petition += str(p.text).strip()
|
||||
petition.rstrip()
|
||||
|
||||
return Motion(name=name, reference=reference,
|
||||
type=motion_type, under_direction_of=under_direction_of,
|
||||
context=context, petition=petition, consultations=consultations,
|
||||
file=file_link)
|
||||
return types.Motion(name=name, reference=reference,
|
||||
type=motion_type, under_direction_of=under_direction_of,
|
||||
context=context, petition=petition, consultations=consultations,
|
||||
file=file_link)
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
from dataclasses import dataclass
from typing import List
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class Consultation:
    """One consultation of a motion, as scraped from the motion page."""

    # True when the row is flagged "Erledigt" and its result is one of the
    # accepted final outcomes (computed by the motion scraper).
    authoritative: bool
    # Link of the agenda item the motion was reached from.
    agenda_item: str
    # Link (href) of the meeting in which the consultation took place.
    meeting: str
    # Organization header text(s) the consultation row belongs to.
    organization: List[str]
    # Role text from the organization header row; None when the header row
    # has no third cell to read the role from.
    role: Optional[str]
    # Text of the result cell of the consultation row.
    result: str
|
||||
|
||||
|
||||
@dataclass
class Motion:
    """A motion together with the consultations recorded for it."""

    # Consultations scraped from the motion page.
    consultations: List[Consultation]
    # Context text of the motion.
    context: str
    # Link to the motion's file.
    file: str
    # Name of the motion.
    name: str
    # Reference of the motion, as supplied by the caller of the scraper.
    reference: str
    # Petition text of the motion.
    petition: str
    # Motion type.
    type: str
    # NOTE(review): presumably the body or person leading the motion - confirm.
    under_direction_of: str
|
||||
|
||||
|
||||
@dataclass
class AgendaItem:
    """A single row of a meeting agenda."""

    # Text of the link in the first cell of the agenda row.
    number: str
    # Zero-based position of the row within the scraped agenda.
    order: int
    # Name of the agenda item.
    name: str
    # NOTE(review): presumably whether the item is publicly visible - confirm.
    public: bool
    # Link (href) of the agenda item itself.
    link: str
    # Link (href) of the associated motion.
    motion_link: str
    # Link text of the associated motion.
    motion_reference: str
    # Resolution text; the agenda scraper initialises it to "".
    resolution_text: str
|
||||
|
||||
|
||||
@dataclass
class Agenda:
    """The agenda of one meeting."""

    # Agenda items in the order they were scraped.
    agenda_items: List[AgendaItem]
|
||||
|
||||
|
||||
@dataclass
class Membership:
    """Membership of one person in one organization."""

    # Link to the person's page; "" when the member row carries no link.
    person: str
    # Name of the organization the membership belongs to.
    organization: str
    # Role text taken from the member row.
    role: str
    # "On behalf of" text taken from the member row.
    on_behalf_of: str
|
||||
|
||||
|
||||
@dataclass
class Organization:
    """An organization and its members."""

    # Classification label, e.g. "Bezirksversammlung" or "Ausschuss".
    classification: str
    # One entry per member row scraped from the organization page.
    membership: List[Membership]
    # Name of the organization.
    name: str
    # Organization type label, e.g. "Gremium".
    organization_type: str
|
||||
|
||||
|
||||
@dataclass
class Person:
    """Contact record of a person."""

    # Name of the person.
    name: str
    # Form of address shown for the person.
    form_of_address: str
    # Phone numbers; may be empty.
    phone: List[str]
    # Email addresses; may be empty.
    email: List[str]
|
|
@ -13,7 +13,6 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import configparser
|
||||
import os
|
||||
from datetime import date
|
||||
|
@ -33,7 +32,7 @@ from twomartens.allrisscraper import meeting
|
|||
from twomartens.allrisscraper.definitions import ALLRIS_LOGIN
|
||||
|
||||
|
||||
def main(args: argparse.Namespace) -> None:
|
||||
def main(_) -> None:
|
||||
config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
|
||||
if not config_module.initialize_config(config_file):
|
||||
return
|
||||
|
@ -207,7 +206,3 @@ def get_abbreviation(name):
|
|||
else:
|
||||
abbreviated_name = f"{abbreviated_name}{part[:1].capitalize()}"
|
||||
return abbreviated_name
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -9,12 +9,13 @@ def main():
|
|||
subparsers = parser.add_subparsers(help="sub-command help", required=True)
|
||||
oparl_parser = subparsers.add_parser("oparl", help="scrapes the public website")
|
||||
oparl_parser.add_argument("--include-organizations", action="store_true", dest="include_organizations")
|
||||
oparl_parser.add_argument("--include-meetings", action="store_true", dest="include_meetings")
|
||||
oparl_parser.set_defaults(function=public.main)
|
||||
internal_parser = subparsers.add_parser("internal", help="scrapes the internal website")
|
||||
internal_parser.set_defaults(function=internal.main)
|
||||
|
||||
args = parser.parse_args()
|
||||
args.func(args)
|
||||
args.function(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -23,7 +23,7 @@ from selenium import webdriver
|
|||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.firefox.webelement import FirefoxWebElement
|
||||
|
||||
from twomartens.allrisscraper.agenda import Agenda
|
||||
from twomartens.allrisscraper import data_types
|
||||
from twomartens.allrisscraper.definitions import MONTHS
|
||||
|
||||
|
||||
|
@ -36,7 +36,7 @@ class Meeting:
|
|||
link: str
|
||||
location: str
|
||||
address: Optional[str]
|
||||
agenda: Optional[Agenda]
|
||||
agenda: Optional[data_types.Agenda]
|
||||
|
||||
|
||||
def get_meetings(driver: webdriver, base_url: str):
|
||||
|
|
|
@ -1,27 +1,12 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
|
||||
|
||||
@dataclass
|
||||
class Membership:
|
||||
person: str
|
||||
organization: str
|
||||
role: str
|
||||
on_behalf_of: str
|
||||
from twomartens.allrisscraper import data_types as types
|
||||
|
||||
|
||||
@dataclass
|
||||
class Organization:
|
||||
classification: str
|
||||
membership: List[Membership]
|
||||
name: str
|
||||
organization_type: str
|
||||
|
||||
|
||||
def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[Organization]:
|
||||
def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[types.Organization]:
|
||||
organizations = [get_organization(driver=driver,
|
||||
link=f"{base_url}/pa021.asp",
|
||||
classification="Bezirksversammlung",
|
||||
|
@ -34,7 +19,7 @@ def get_organizations(driver: webdriver.Firefox, base_url: str) -> List[Organiza
|
|||
return organizations
|
||||
|
||||
|
||||
def get_committees(driver: webdriver.Firefox, link: str) -> List[Organization]:
|
||||
def get_committees(driver: webdriver.Firefox, link: str) -> List[types.Organization]:
|
||||
driver.get(link)
|
||||
committee_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr[not(contains(@class, 'zw1'))]")[2:-1]
|
||||
organizations = []
|
||||
|
@ -48,11 +33,11 @@ def get_committees(driver: webdriver.Firefox, link: str) -> List[Organization]:
|
|||
for link in links:
|
||||
organizations.append(get_organization(driver=driver, link=link,
|
||||
classification="Ausschuss", organization_type="Gremium"))
|
||||
|
||||
|
||||
return organizations
|
||||
|
||||
|
||||
def get_factions(driver: webdriver.Firefox, link: str) -> List[Organization]:
|
||||
def get_factions(driver: webdriver.Firefox, link: str) -> List[types.Organization]:
|
||||
driver.get(link)
|
||||
driver.get(link)
|
||||
faction_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1]
|
||||
|
@ -71,22 +56,26 @@ def get_factions(driver: webdriver.Firefox, link: str) -> List[Organization]:
|
|||
return organizations
|
||||
|
||||
|
||||
def get_organization(driver: webdriver.Firefox, link: str, classification: str, organization_type: str) -> Organization:
|
||||
def get_organization(driver: webdriver.Firefox, link: str, classification: str,
|
||||
organization_type: str) -> types.Organization:
|
||||
driver.get(link)
|
||||
name = str(driver.find_element_by_xpath("//div[@id='risname']").text)
|
||||
name = str(driver.find_element_by_xpath("//div[@id='risname']").text).strip()
|
||||
memberships = []
|
||||
member_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr")[2:-1]
|
||||
member_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table[2]//tr")[2:-1]
|
||||
for member_tr in member_trs:
|
||||
memberships.append(get_membership(member_tr, name))
|
||||
|
||||
return Organization(name=name, classification=classification,
|
||||
organization_type=organization_type, membership=memberships)
|
||||
return types.Organization(name=name, classification=classification,
|
||||
organization_type=organization_type, membership=memberships)
|
||||
|
||||
|
||||
def get_membership(member_tr: WebElement, organization: str) -> Membership:
|
||||
def get_membership(member_tr: WebElement, organization: str) -> types.Membership:
|
||||
tds = member_tr.find_elements_by_xpath("td")
|
||||
person_link = str(tds[2].find_element_by_xpath("a").get_property("href")).strip()
|
||||
if len(tds[2].find_elements_by_xpath("a")) == 0:
|
||||
person_link = ""
|
||||
else:
|
||||
person_link = str(tds[2].find_element_by_xpath("a").get_property("href")).strip()
|
||||
role = str(tds[3].text).strip()
|
||||
on_behalf_of = str(tds[4].text).strip()
|
||||
|
||||
return Membership(person=person_link, organization=organization, role=role, on_behalf_of=on_behalf_of)
|
||||
return types.Membership(person=person_link, organization=organization, role=role, on_behalf_of=on_behalf_of)
|
||||
|
|
|
@ -1,45 +1,53 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
|
||||
from selenium import webdriver
|
||||
|
||||
from twomartens.allrisscraper.organization import Organization
|
||||
from twomartens.allrisscraper import data_types as types
|
||||
|
||||
|
||||
@dataclass
|
||||
class Person:
|
||||
name: str
|
||||
form_of_address: str
|
||||
phone: List[str]
|
||||
email: List[str]
|
||||
|
||||
|
||||
def get_persons(driver: webdriver.Firefox, organizations: List[Organization]) -> List[Person]:
|
||||
persons: Dict[str, Person] = {}
|
||||
def get_persons(driver: webdriver.Firefox, organizations: List[types.Organization]) -> List[types.Person]:
|
||||
persons: Dict[str, types.Person] = {}
|
||||
for org in organizations:
|
||||
memberships = org.membership
|
||||
for membership in memberships:
|
||||
person_link = membership.person
|
||||
if person_link in persons:
|
||||
continue
|
||||
if person_link == "":
|
||||
continue
|
||||
persons[person_link] = get_person(driver=driver, link=person_link)
|
||||
|
||||
|
||||
return list(persons.values())
|
||||
|
||||
|
||||
def get_person(driver: webdriver.Firefox, link: str) -> Person:
|
||||
def get_person(driver: webdriver.Firefox, link: str) -> types.Person:
|
||||
driver.get(link)
|
||||
meta_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table//tr//td//table//tr")
|
||||
meta_trs = driver.find_elements_by_xpath("//div[@id='rismain']//table[1]//tr//td//table//tr")
|
||||
form_of_address = str(meta_trs[0].find_element_by_xpath("td[3]").text).strip()
|
||||
name = str(meta_trs[1].find_element_by_xpath("td").text).strip()
|
||||
phone_tds = meta_trs[5].find_elements_by_xpath("td")
|
||||
phone = ""
|
||||
if len(phone_tds) > 1:
|
||||
phone = str(meta_trs[5].find_element_by_xpath("td[2]//span").text).strip()
|
||||
email_tds = meta_trs[6].find_elements_by_xpath("td")
|
||||
email = ""
|
||||
if len(email_tds) > 1:
|
||||
email = str(meta_trs[6].find_element_by_xpath("td[2]//a").text).strip()
|
||||
phone = []
|
||||
email = []
|
||||
additional_trs = meta_trs[2:]
|
||||
for tr in additional_trs:
|
||||
tds = tr.find_elements_by_tag_name("td")
|
||||
if len(tds) == 1 and str(tds[0].text).strip() == "":
|
||||
continue
|
||||
images = tr.find_elements_by_xpath("td[1]//img")
|
||||
if len(images) == 0:
|
||||
continue
|
||||
alt = images[0].get_property("alt")
|
||||
if is_email_row(alt):
|
||||
email.append(str(tr.find_element_by_xpath("td[2]//a").text).strip())
|
||||
if is_phone_row(alt):
|
||||
phone.append(str(tr.find_element_by_xpath("td[2]//span").text).strip())
|
||||
|
||||
return Person(name=name, form_of_address=form_of_address, phone=[phone], email=[email])
|
||||
return types.Person(name=name, form_of_address=form_of_address, phone=phone, email=email)
|
||||
|
||||
|
||||
def is_email_row(alt: str) -> bool:
    """Return True when the icon alt text *alt* contains "eMail"."""
    marker = "eMail"
    return marker in alt
|
||||
|
||||
|
||||
def is_phone_row(alt: str) -> bool:
    """Return True when the icon alt text *alt* contains "Tel"."""
    marker = "Tel"
    return marker in alt
|
||||
|
|
|
@ -15,8 +15,6 @@ from twomartens.allrisscraper import meeting
|
|||
from twomartens.allrisscraper import organization
|
||||
from twomartens.allrisscraper import person
|
||||
|
||||
XPATH_2ND_TD = "td[2]"
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
config_file = f"{os.getcwd()}/tm-allris-scraper-config.ini"
|
||||
|
@ -35,27 +33,26 @@ def main(args: argparse.Namespace):
|
|||
binary = FirefoxBinary(firefox_binary)
|
||||
driver = webdriver.Firefox(firefox_binary=binary, options=options)
|
||||
driver.implicitly_wait(2)
|
||||
meetings = meeting.get_meetings(driver, base_url)
|
||||
agenda.process_agendas(driver, meetings)
|
||||
motions = agenda.get_motions(driver, meetings)
|
||||
organizations = []
|
||||
persons = []
|
||||
os.makedirs(json_path, exist_ok=True)
|
||||
if args.include_meetings:
|
||||
meetings = meeting.get_meetings(driver, base_url)
|
||||
agenda.process_agendas(driver, meetings)
|
||||
motions = agenda.get_motions(driver, meetings)
|
||||
with open(json_path + "meetings.json", "w") as file:
|
||||
json.dump(meetings, file,
|
||||
cls=custom_json.EnhancedJSONEncoder)
|
||||
with open(json_path + "motions.json", "w") as file:
|
||||
json.dump(motions, file,
|
||||
cls=custom_json.EnhancedJSONEncoder)
|
||||
|
||||
if args.include_organizations:
|
||||
organizations = organization.get_organizations(driver, base_url)
|
||||
persons = person.get_persons(driver, organizations)
|
||||
driver.close()
|
||||
|
||||
os.makedirs(json_path, exist_ok=True)
|
||||
with open(json_path + "meetings.json", "w") as file:
|
||||
json.dump(meetings, file,
|
||||
cls=custom_json.EnhancedJSONEncoder)
|
||||
with open(json_path + "motions.json", "w") as file:
|
||||
json.dump(motions, file,
|
||||
cls=custom_json.EnhancedJSONEncoder)
|
||||
if args.include_organizations:
|
||||
with open(json_path + "organizations.json", "w") as file:
|
||||
json.dump(organizations, file,
|
||||
cls=custom_json.EnhancedJSONEncoder)
|
||||
with open(json_path + "persons.json", "w") as file:
|
||||
json.dump(persons, file,
|
||||
cls=custom_json.EnhancedJSONEncoder)
|
||||
|
||||
driver.close()
|
||||
|
|
Chargement…
Référencer dans un nouveau ticket