-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_motioner.py
153 lines (118 loc) · 4.78 KB
/
parse_motioner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import io
import json
import pandas as pd
import multiprocessing as mp
from bs4 import BeautifulSoup, NavigableString
def read_motion_json(filename, folder="data_json"):
    """Load one motion document from *folder* and return it as a dict.

    The files are stored with a UTF-8 byte-order mark, hence the
    ``utf-8-sig`` codec.
    """
    path = os.path.join(folder, filename)
    with io.open(path, mode="r", encoding="utf-8-sig") as handle:
        return json.load(handle)
def author_party_count(motion):
    """Count a motion's authors per party abbreviation.

    Parameters
    ----------
    motion : dict
        Parsed motion JSON; author records live under
        ``dokumentstatus.dokintressent.intressent``, each carrying a
        ``partibet`` party abbreviation.

    Returns
    -------
    dict
        Mapping party abbreviation -> number of authors. The eight
        Riksdag parties plus independents ("-") are always present
        (possibly 0). Any other abbreviation in the data is counted
        under its own key instead of raising KeyError (the original
        crashed the whole parsing run on an unexpected code).
    """
    authors = motion["dokumentstatus"]["dokintressent"]["intressent"]
    # The API returns a bare dict (not a one-element list) when a motion
    # has a single author; normalize to a list first.
    if isinstance(authors, dict):
        authors = [authors]
    party_count = {party: 0 for party in ("V", "S", "MP", "C", "L", "KD", "M", "SD", "-")}
    for author in authors:
        party = author["partibet"]
        party_count[party] = party_count.get(party, 0) + 1
    return party_count
def parse_html(motion):
    """Extract cleaned plain text and metadata from a motion's HTML.

    Parameters
    ----------
    motion : dict
        Parsed motion JSON containing ``dokumentstatus.dokument.html``.

    Returns
    -------
    dict or None
        Flat dict of motion fields (text plus metadata and the per-party
        author counts), or ``None`` for expired ("Utgången") documents
        and documents lacking the expected HTML structure.
    """
    dokument = motion["dokumentstatus"]["dokument"]
    # Expired motions carry no usable text; bail out before the
    # (comparatively expensive) HTML parse.
    if dokument["status"] == "Utgången":
        return None
    soup = BeautifulSoup(dokument["html"], "html.parser")
    # Remove signatures at bottom of bill that contain party affiliations
    for signature in soup.find_all(
        "p", attrs={"style": "-aw-sdt-tag:CC_Underskrifter; -aw-sdt-title:CC_Underskrifter"}
    ):
        signature.extract()
    for table in soup.find_all("table"):
        table.extract()
    # Add periods to headers, so we can perform sentence segmentation later if necessary
    for title in soup.find_all(["h1", "h2", "h3"]):
        for span in title.find_all("span"):
            span.insert(len(span.get_text()), NavigableString("."))
    # We only want text after the motion proper has started. NOTE: keep the
    # explicit `is None` checks — empty bs4 tags are falsy, so `or`-chaining
    # the two lookups would misbehave on a childless anchor tag.
    motion_start = soup.find("a", attrs={"name": "MotionsStart"})
    if motion_start is None:
        motion_start = soup.find("h1")
    if motion_start is None:
        # Ignore the few motioner that don't conform to the HTML formatting
        # standard. (Bug fix: the original called print(print(...)), which
        # also printed a spurious "None" line.)
        print(dokument["dok_id"])
        return None
    # Find all headers and paragraphs after motion start
    tags = motion_start.find_all_next(["h1", "h2", "h3", "p"])
    soup = BeautifulSoup("".join(str(tag) for tag in tags), "html.parser")
    motion_text = "".join(tag.get_text() for tag in soup)
    motion_text = " ".join(motion_text.split())  # Remove newlines, excessive whitespace
    party_count = author_party_count(motion)
    motion_fields = {
        "dok_id": dokument["dok_id"],
        "text": motion_text,
        "datum": dokument["datum"],
        "dokument_url_html": dokument["dokument_url_html"],
        "titel": dokument["titel"],
        "subtitel": dokument["subtitel"],
        "organ": dokument["organ"],
        "subtyp": dokument["subtyp"],
        # Wrapped in a list so pd.json_normalize keeps it as one cell that
        # can later be exploded into per-party columns.
        "party_count": [party_count],
        "hangar_id": dokument["hangar_id"],
    }
    return motion_fields
def get_motion_text(filename, folder="data_json"):
    """Load one motion JSON file from *folder* and return its parsed
    fields, or ``None`` when the document yields no usable text.
    """
    return parse_html(read_motion_json(filename, folder))
def main():
    """Parse every motion JSON file in ``data_json`` and write the
    combined, cleaned dataset to ``motioner_2014_2021.parquet``.
    """
    motioner_list = os.listdir("data_json")
    # CPU-bound HTML parsing: fan out across all cores. The context
    # manager terminates the pool even if a worker raises (the original
    # only ever called close()).
    with mp.Pool() as pool:
        res = pool.map(get_motion_text, motioner_list)
    # Drop expired/non-conforming motions signalled by None.
    res = [motion for motion in res if motion is not None]
    df = pd.json_normalize(res)
    df["datum"] = pd.to_datetime(df["datum"])
    df["hangar_id"] = pd.to_numeric(df["hangar_id"])
    # One authors_<party> count column per party; nr_authors sums them
    # before the rename so the "-" column is still included.
    df_authors = pd.DataFrame(df["party_count"].explode().tolist()).add_prefix("authors_")
    df_authors["nr_authors"] = df_authors.sum(axis=1)
    df_authors = df_authors.rename(columns={"authors_-": "authors_independent"})
    # Comma-separated list of contributing parties; slice(8) strips the
    # "authors_" prefix from the column names.
    df_authors["party"] = (df_authors.iloc[:, 0:9] >= 1).apply(
        lambda col: ",".join(col.index[col].str.slice(8)), axis=1
    )
    df = pd.concat([df.drop("party_count", axis=1), df_authors], axis=1)
    df["single_party_authors"] = (
        df[
            [
                "authors_V",
                "authors_S",
                "authors_MP",
                "authors_C",
                "authors_L",
                "authors_KD",
                "authors_M",
                "authors_SD",
                "authors_independent",
            ]
        ]
        >= 1
    ).sum(axis="columns")
    df["single_party_authors"] = df["single_party_authors"] <= 1
    # Strip boilerplate heading remnants from the start of the text.
    # Raw strings fix the invalid "\." escape of the original, and the
    # literal periods are now escaped so e.g. "Motiveringen" is untouched.
    df["text"] = df["text"].str.replace(r"^\.", "", regex=True)
    df["text"] = df["text"].str.replace(r"^Motivering\.", "", regex=True).str.strip()
    df["text"] = df["text"].str.replace(r"^Bakgrund\.", "", regex=True).str.strip()
    df["text"] = df["text"].str.replace(r"^Inledning", "", regex=True).str.strip()
    # Remove soft hyphens (U+00AD), a plain-substring replacement:
    # https://stackoverflow.com/questions/51976328/best-way-to-remove-xad-in-python
    df["text"] = df["text"].str.replace("\xad", "", regex=False)
    df.to_parquet("motioner_2014_2021.parquet")


if __name__ == "__main__":
    # Required for multiprocessing on spawn-start platforms (Windows,
    # macOS): without this guard every worker re-imports the module and
    # re-executes the pool setup.
    main()