-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
103 lines (92 loc) · 3.53 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from pathlib import Path
import re
from tqdm import tqdm
import deal
from LDS.book_loader import BookLoader
from LDS.summarizer_ios import read_references
from LDS.summarizer_ios import output_summaries, output_scores
from LDS.evaluate import evaluate
from LDS.summarizer_factory import summarizer_factory, ModelName
from LDS.nlp_utils import RE_ALPHA
from LDS.textrank import TextRank
# Run configuration for this script.
# `deal` contract checking is switched off here; call deal.enable() instead
# of deal.disable() to turn it back on.
deal.disable()
# Model identifier handed to summarizer_factory further down.
MODEL_NAME: ModelName = "textrank"
# NOTE: textrank_french_semantic has the overall best scores
SENTENCE_ENCODER: TextRank.SentenceEncoder = "french_semantic"
# Output locations; resolve() makes them absolute relative to the current
# working directory, so the script is expected to be run from the repo root
# (NOTE(review): confirm).
SUMMARIES_OUT_PATH = Path("data/output_summaries/").resolve()
SCORES_OUT_PATH = Path("scores/textrank/").resolve()
# Load the source document through BookLoader. Each entry of `markers` is a
# regular expression (or a list of them) that BookLoader receives under the
# given key; how each key is interpreted is defined in LDS.book_loader.
#
# NOTE(review): two patterns were corrected here ("undesirables" and
# "citing", see the comments next to them). Both corrections change what the
# patterns match, so if the hard-coded chapter lengths checked right after
# loading were recorded with the old patterns, they must be re-measured.
book = BookLoader(
    doc_path="data/D5627-Dolan.docx",
    markers={
        "slice": [r"^Introduction$", r"Annexe /$"],
        "chapter": r"^Chapitre \d+ /$|^Conclusion$",
        "headers": r"^Chapitre \d+ /.+"
                   r"|^Introduction$"
                   r"|^Stress, santé et performance au travail$"
                   r"|^Conclusion$",
        "footnotes": re.compile(
            r""".+?[A-Z]\.  # At least one character + a capital letter + a dot
                \s.*?       # + Whitespace + any # of characters
                \(\d{4}\)   # + 4 digits within parens
            """, re.VERBOSE),  # e.g. "12 Zuckerman, M. (1971). Dimensions of ..."
        # Under re.VERBOSE, unescaped whitespace inside the pattern is
        # ignored, so the spaces of "Matière à réFlexion" are spelled \s
        # (as in the "Source" alternative); written as bare spaces they were
        # dropped and the alternative could never match the heading.
        "undesirables": re.compile(
            r"""^CONFUCIUS$
                |^Matière\sà\sréFlexion$
                |^/\tPost-scriptum$
                |^<www\.pbs\.org/bodyandsoul/218/meditation\.htm>.+?\.$
                |^Source\s:\s
            """, re.VERBOSE),
        # This pattern is an f-string (to interpolate RE_ALPHA), so the
        # quantifier braces must be doubled: a bare {3,} is evaluated as the
        # tuple (3,) and injected into the pattern as the text "(3,)".
        "citing": re.compile(
            rf"""((?:{RE_ALPHA}){{3,}}?)  # Capture at least 3 alphabetic characters
                 \d+                      # + at least one digit
            """, re.VERBOSE),  # e.g. "cited1"
        "na_span": [
            # Starts with this:
            r"^exerCiCe \d\.\d /$",
            # Ends with any of these:
            r"^Chapitre \d+ /$"
            r"|^Conclusion$"
            r"|^Les caractéristiques personnelles\."
            r"|/\tLocus de contrôle$"
            r"|^L'observation de sujets a amené Rotter"
            r"|^Lorsqu'une personne souffre de stress",
        ],
    },
)
# Regression guard: the len() of every loaded chapter must equal the values
# recorded for this book. The check is raised explicitly (rather than with a
# bare `assert`) so it still runs under `python -O`; the exception type is
# kept as AssertionError.
observed_lengths = [len(c) for c in book.chapters]
expected_lengths = [30155, 48537, 70349, 71779, 87327, 96484, 11090]
if observed_lengths != expected_lengths:
    raise AssertionError(
        "Unexpected chapter lengths: "
        f"expected {expected_lengths}, observed {observed_lengths}"
    )

# Chapters to summarize and the reference summaries they are paired with.
chapters_to_summarize = book.get_chapters(1, 3)
references = read_references(Path("data/references/").resolve())
# The two sequences are zipped together later on; a length mismatch would
# silently drop the unpaired items, so fail loudly here instead.
if len(chapters_to_summarize) != len(references):
    raise AssertionError(
        f"Got {len(chapters_to_summarize)} chapters to summarize "
        f"but {len(references)} reference summaries"
    )
print("GENERATING SUMMARIES PER CHAPTER...")

# summarizer_factory hands back two callables: one that produces a summary
# and one that derives the requested summary length from a reference.
summarizer, get_summary_len = summarizer_factory(  # pylint: disable=unpacking-non-sequence
    MODEL_NAME,
    sentence_encoder=SENTENCE_ENCODER,
)

# One record per (chapter, reference) pair, numbered from 1, with a progress
# bar over the pairs.
summary_units = [
    {
        "CHAPTER": number,
        "SUMMARY": summarizer(text, get_summary_len(reference)),
        "REFERENCE": reference,
    }
    for number, (text, reference) in tqdm(
        enumerate(zip(chapters_to_summarize, references), start=1),
        total=len(references),
    )
]
# Label used for the outputs: TextRank runs are suffixed with the sentence
# encoder, any other model keeps its plain name.
if MODEL_NAME == "textrank":
    model_name = f"{MODEL_NAME}_{SENTENCE_ENCODER}"
else:
    model_name = MODEL_NAME

# Pass the per-chapter records to output_summaries.
output_summaries(
    summary_units,
    out_path=SUMMARIES_OUT_PATH,
    model_name=model_name,
    post_read_sample=False,
)

# Split the records back into parallel lists, score them, and pass the
# scores to output_scores.
summaries = [unit["SUMMARY"] for unit in summary_units]
references = [unit["REFERENCE"] for unit in summary_units]
scores = evaluate(summaries, references)
output_scores(scores, out_path=SCORES_OUT_PATH, model_name=model_name)