-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: text_regressor.py
78 lines (61 loc) · 2.76 KB
/
text_regressor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: utf-8 -*-
"""text_regressor.ipynb
Automatically generated by Colaboratory.
"""
# !pip install -q flair
# !apt install aptitude swig
# !aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# !pip install mecab-python3
# !pip install konoha[mecab]
from flair.data import Corpus, build_japanese_tokenizer, Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models.text_regression_model import TextRegressor
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
# 1. instantiate the Japanese tokenizer, shared by corpus loading and inference
japanese_tokenizer = build_japanese_tokenizer()

# Folder that must contain the train, test and dev CSV files.
data_folder = 'resources/data'

# Column format: column 1 holds the text, column 0 holds the label/target value.
column_name_map = {1: "text", 0: "label_topic"}

# 2. load the corpus (train/dev/test); skip_header drops the CSV header row.
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map,
                                         skip_header=True,
                                         delimiter='\t',  # tab-separated files
                                         tokenizer=japanese_tokenizer,
                                         )
# NOTE(review): the original script also evaluated `corpus.train.raw_data` and
# built a label dictionary here; both results were never used (a TextRegressor
# predicts continuous values and needs no label dictionary), so the dead work
# has been removed.

# 3. make a list of word embeddings (Japanese forward/backward Flair LMs)
word_embeddings = [
    FlairEmbeddings('ja-forward'),
    FlairEmbeddings('ja-backward'),
]

# 4. initialize document embedding by passing the list of word embeddings.
# Many RNN types can be chosen (GRU by default; change via rnn_type parameter).
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                   hidden_size=512,
                                                                   reproject_words=True,
                                                                   reproject_words_dimension=256,
                                                                   )

# 5. create the text regressor on top of the document embeddings
regressor = TextRegressor(document_embeddings)

# 6. initialize the trainer with the model and the corpus
trainer = ModelTrainer(regressor, corpus)

# 7. start the training; checkpoints and logs go to 'resources/regressor'
trainer.train('resources/regressor',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,   # halve LR when dev score plateaus
              patience=5,          # epochs without improvement before annealing
              max_epochs=150)
# To reload a previously trained model instead of the in-memory one:
# model = TextRegressor.load('resources/regressor/best-model.pt')

# Create an example sentence, tokenized with the same Japanese tokenizer
# that was used for the training corpus.
sentence = Sentence('Windows起動時に', use_tokenizer=japanese_tokenizer)
print(sentence.to_tokenized_string())

# Predict the regression value (annotates the sentence in place) and show it.
# The original ended with a bare `sentence.to_dict()` whose result was
# discarded outside a notebook; print it so the prediction is actually visible.
regressor.predict(sentence)
print(sentence.to_dict())