-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbert-topic.py
67 lines (53 loc) · 2.22 KB
/
bert-topic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re

from bertopic import BERTopic
from load_texts import *
# Load the data frame whose 'text' column holds the documents.
# return_data_frame() is provided by the load_texts star import.
print('loading texts...')
df = return_data_frame()
# print(df.info())

# Extend the base stop-word list (presumably also supplied by load_texts --
# confirm) with corpus-specific words. A new list is built with `+` so the
# imported list object is not mutated in place.
print('removing stop words...')
Mystopwords = Mystopwords + [
    'aa', 'anterior', 'ao', 'banco', 'carlos', 'central', 'comitê',
    'conforme', 'copom', 'dia', 'dias', 'dos', 'doze',
    'edições', 'entanto', 'hoje', 'item', 'luiz', 'membros', 'mensal',
    'meses', 'mil', 'monetária', 'número', 'os', 'otávio', 'pb', 'período',
    'política', 'pp', 'quarto', 'realizou', 'relação',
    # 'repectivamente' was a typo that never matched the real word; the
    # misspelling is kept for backward compatibility and the correct
    # spelling is added next to it.
    'repectivamente', 'respectivamente',
    'reunião', 'se', 'três', 'últimas',
    'trimestre', 'trimestres', 'variou',
    # The empty string removes the empty tokens that str.split(" ") yields
    # for consecutive spaces.
    '',
]
# One list entry per row of the 'text' column.
corpus = df['text'].to_list()

# When True, every text is broken into its lines ("paragraphs") and each
# paragraph becomes its own document; when False the texts are used whole.
PARAGRAPHS = True
if PARAGRAPHS:
    corpusJoined = '\n'.join(corpus)
    # Collapse a line that holds only a single space into one line break.
    corpusJoined = re.sub(r'\n \n', '\n', corpusJoined)
    corpus_aux = corpusJoined.split('\n')
    # Keep every paragraph that is neither empty nor a lone space.
    corpus = [txt for txt in corpus_aux if txt not in ('', ' ')]
    print(len(corpus), "paragraphs")
    # Paragraph mode: unigram topic terms, larger minimum topic size.
    ngram_range = (1, 1)
    minTopicSize = 100
else:
    # Whole-text mode: bigram/trigram topic terms, smaller minimum topic size.
    ngram_range = (2, 3)
    minTopicSize = 10
# Remove stop words from every document. Tokens are obtained by splitting on
# single spaces and compared verbatim (case-sensitive) against the stop-word
# list. The list is turned into a set once so each membership test is O(1)
# instead of a linear scan per token.
stopword_set = set(Mystopwords)
corpus = [
    ' '.join(word for word in doc.split(" ") if word not in stopword_set)
    for doc in corpus
]
# No loop temporaries are left behind, so the former `del words, words_new`
# (which raised NameError when the corpus was empty) is no longer needed.
del stopword_set
print('running bertopic...')

# Gather the model settings in one place before building the model.
# n-gram range and minimum topic size were chosen earlier in the script.
bertopic_settings = dict(
    language='multilingual',
    n_gram_range=ngram_range,
    min_topic_size=minTopicSize,
    calculate_probabilities=True,
    verbose=True,
)
topic_model = BERTopic(**bertopic_settings)

# Fit the model on the prepared corpus; the call returns the per-document
# topic assignments and the probabilities.
fit_result = topic_model.fit_transform(corpus)
topics, probs = fit_result
# Summary table: one row per topic with its size and representation.
topic_info = topic_model.get_topic_info()
print(topic_info[["Topic", "Count", "Representation"]])

# Print the words of every real topic. The topic ids are taken from the
# summary table instead of being derived from a count: the previous
# `i < topic_info.shape[0] - 1` guard assumed the outlier topic (-1) is always
# present and silently skipped the last real topic whenever it was not.
for topic_id in topic_info["Topic"].tolist():
    if topic_id == -1:
        # -1 is BERTopic's outlier bucket, not a real topic.
        continue
    print(f'Topic {topic_id}')
    print([word for word, _ in topic_model.get_topic(topic_id)])
    # print(topic_model.get_representative_docs(topic=topic_id))
    print()
# Per-document view: assigned topic, probability and the topic's top words.
doc_info = topic_model.get_document_info(corpus)
print(doc_info[["Document", "Topic", "Probability", "Top_n_words"]])

# visualize_barchart() only builds and returns a figure. When this file runs
# as a script the return value was discarded and nothing was rendered, so the
# figure is now shown explicitly.
fig = topic_model.visualize_barchart(top_n_topics=12, n_words=10, height=500)
fig.show()
# Report the index of every document that is blank (empty or whitespace
# only). The previous test `len(doc.split(" ")) == 0` could never be true
# because str.split(" ") always returns at least one element
# (''.split(" ") == ['']), so blank documents were never reported.
for i, doc in enumerate(corpus):
    if not doc.strip():
        print(i)