import re
import os
import ujson
import string
import nltk
from subprocess import check_output
from collections import defaultdict
from ast import literal_eval as make_tuple


class RhymeTagger:
    '''
    Collocation-driven method of discovering rhymes in a corpus of poetic texts
    --------------------------------------------------------------
    For details see Plecháč, P. (2018). A Collocation-Driven Method of
    Discovering Rhymes (in Czech, English, and French Poetry). In M. Fidler,
    V. Cvrček (eds.), Taming the Corpus. From Inflection and Lexis to
    Interpretation. Cham: Springer, 79–95.
    '''
    def __init__(self):
        '''
        Initialize tagger
        --------------------------------------------------------------
        '''

        # Check if NLTK punkt is available, otherwise download it
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        # Get regexp corresponding to syllable peaks in IPA transcription
        self.define_syll_peaks()

        # Get regexp corresponding to punctuation
        self.define_punctuation()
    def define_syll_peaks(self):
        '''
        Define syllable peaks by means of a regexp
        --------------------------------------------------------------
        '''

        # Any vowel-char followed by an optional length mark
        vowels = '[iyɨʉɯuɪʏʊeøɤoəɘɵɛœʌɔæɐaăɶɑɒ][ːˑ]?'

        # Modifier indicating multichar phonemes (combining tie bar)
        tiechar = chr(865)

        # Modifier indicating syllabicity of a consonant
        syllchar = chr(809)

        # Peak defined as vowel-tie-vowel OR vowel OR syllabic consonant
        self.syllable_peaks = '{0}{1}{0}|{0}|.{2}'.format(
            vowels, tiechar, syllchar
        )
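
    # A minimal sketch of what the peak regexp does (illustrative only):
    #
    #   >>> rt = RhymeTagger()
    #   >>> re.split('(' + rt.syllable_peaks + ')', 'fluːt')
    #   ['fl', 'uː', 't']
    #
    # i.e. a transcription splits into consonant clusters and syllable
    # peaks, with the peaks captured as separate items.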

    def define_punctuation(self):
        '''
        Define punctuation by means of a regexp
        --------------------------------------------------------------
        '''
        # Character class covering ASCII punctuation plus common typographic
        # marks (a raw string keeps the regex escapes intact)
        self.punctuation = '[' + string.punctuation + r'¿«»¡…“”\(\)\[\]–—’' + ']'

    def new_model(self, lang, transcribed=False, window=5, syll_max=2,
                  stress=True, vowel_length=True, ngram=1, ngram_length=3,
                  same_words=True, t_score_min=3.078, frequency_min=3,
                  stanza_limit=False, prob_ipa_min=0.95, prob_ngram_min=0.95,
                  max_iter=20, verbose=True):
        '''
        Initialize a new model
        --------------------------------------------------------------
        :lang = [string] language code as required by eSpeak
        :transcribed = [boolean] whether transcription will also be passed
        :window = [int] how many lines forward to look for rhymes
        :syll_max = [int] maximum number of syllables taken into account
        :stress = [boolean] whether to focus only on sounds after the last stress
        :vowel_length = [boolean] whether vowel length should be taken into account
        :ngram = [int] upon which iteration to start taking n-grams into account
                 (one-based indexing, 0 = disregard n-grams completely)
        :ngram_length = [int] length of the n-grams
        :same_words = [boolean] whether two identical words may rhyme
        :t_score_min = [float] minimum t-score required to add a pair to the train set
        :frequency_min = [int] minimum number of pair occurrences required to add it to the train set
        :stanza_limit = [boolean] whether rhymes may only appear within the same stanza
        :prob_ipa_min = [float] minimum ipa-based probability to treat a pair as a rhyme
        :prob_ngram_min = [float] minimum ngram-based probability to treat a pair as a rhyme
        :max_iter = [int] maximum number of training iterations (epochs)
        :verbose = [boolean] should progress be printed out?
        '''

        # Parameters
        self.lang = lang
        self.transcribed = transcribed
        self.window = window
        self.syll_max = syll_max
        self.stress = stress
        self.same_words = same_words
        self.vowel_length = vowel_length
        self.ngram = ngram
        self.ngram_length = ngram_length
        self.t_score_min = t_score_min
        self.frequency_min = frequency_min
        self.stanza_limit = stanza_limit
        self.prob_ipa_min = prob_ipa_min
        self.prob_ngram_min = prob_ngram_min
        self.max_iter = max_iter
        self.verbose = verbose

        # Slots to hold current stanza and poem id
        self.stanza_id = 0
        self.poem_id = 0

        # Container for the dataset. Each item corresponds to a single line
        # and holds a tuple (rhyme_word, poem_id, stanza_id)
        self.data = list()

        # Container for the vocabulary. Each key is a rhyme_word found in
        # the dataset and holds a tuple ([components], final_ngram)
        self.rhyme_vocab = dict()

        # Containers for overall frequencies of words, n-grams and components
        self.f = defaultdict(lambda: defaultdict(int))
        self.n = defaultdict(int)

        # Container for the training set
        self.train_set = defaultdict(lambda: defaultdict(int))

        # Container for probabilities derived from the training set
        self.probs = defaultdict(dict)

        # Raise an exception if the language is not specified and
        # transcription is not provided
        if not self.lang and not self.transcribed:
            raise Exception(
                'Language code must be specified when transcribed == False'
            )

        # Print info if required
        if self.verbose:
            print('\nNew model initialized\n')

    def add_to_model(self, poem):
        '''
        Add a new poem to the model
        --------------------------------------------------------------
        :poem = [list] either a list of lines OR a list of lists (stanzas >
                lines); each item may be either a string holding the text
                of the line OR an ipa transcription (tagging only) OR a dict
                holding both orthography and ipa transcription
                {'text': ..., 'ipa': ...}
        '''
        if self.verbose:
            print(' ...adding poem #{}'.format(self.poem_id+1)+' '*10, end='\r')

        self.stanza_id = 0
        for x in poem:
            if isinstance(x, list):
                for l in x:
                    self._parse_line(l)
                self.stanza_id += 1
            else:
                self._parse_line(x)
        self.poem_id += 1

    def _parse_line(self, line):
        '''
        Parse a line: append a tuple (rhyme_word, poem_id, stanza_id)
        to the dataset and store the word's sound components and final
        n-gram in the vocabulary
        --------------------------------------------------------------
        :line = [string|dict] text of the line OR a dict holding both
                orthography and ipa transcription {'text': ..., 'ipa': ...}
        '''

        # Extract the line-final word
        if not self.transcribed:
            rhyme_word = self._get_rhyme_word(line)
        else:
            rhyme_word = self._get_rhyme_word(line['text'])

        # Append it to the dataset along with poem_id and stanza_id
        self.data.append((
            rhyme_word, self.poem_id, self.stanza_id
        ))

        # If this word has not been seen yet, get its components
        # and n-gram and store them in the vocabulary
        if rhyme_word and rhyme_word not in self.rhyme_vocab:
            if not self.transcribed:
                ipa = self._transcription(rhyme_word)
                rhyme_snds = self._split_ipa_components(ipa)
            else:
                final_ipa = nltk.tokenize.word_tokenize(line['ipa'])[-1]
                rhyme_snds = self._split_ipa_components(final_ipa)
            ngram = self._final_ngram(rhyme_word)
            self.rhyme_vocab[rhyme_word] = (rhyme_snds, ngram)

    def _get_rhyme_word(self, text):
        '''
        Get the final word of the line
        --------------------------------------------------------------
        :text = [string] in the specified language
        '''

        # Tokenize the line
        tokens = nltk.tokenize.word_tokenize(text)

        # Remove punctuation
        tokens = [x for x in tokens if not re.match(self.punctuation+'+$', x)]

        # If the line ends with a word preceded by an apostrophe, merge the
        # last two tokens and strip punctuation (this preserves cases such
        # as nape's, John's, etc., rather than keeping only 's)
        if re.search("'[^ "+string.punctuation+"]+$", text) and len(tokens) > 1:
            rhyme_word = re.sub(self.punctuation, '', ''.join(tokens[-2:]).lower())
            return rhyme_word

        # Return None if there's no word at all
        elif len(tokens) == 0:
            return None

        # Otherwise return the rhyme word
        else:
            rhyme_word = re.sub(self.punctuation, '', tokens[-1].lower())
            return rhyme_word
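
    # A small illustration of the apostrophe handling (hypothetical inputs;
    # NLTK splits "John's" into ["John", "'s"], so the last two tokens are
    # re-joined to keep the possessive on the rhyme word):
    #
    #   "He waited at the gate of John's"  ->  rhyme word "john's"
    #   "A bird, a cloud, a stone."        ->  rhyme word "stone"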

    def _transcription(self, text):
        '''
        Transcribe a text to IPA using eSpeak
        --------------------------------------------------------------
        :text = [string] in the specified language
        '''

        # Transcribe the entire line with eSpeak
        ipa = check_output([
            "espeak", "-q", "--ipa=1", '-v', self.lang, text
        ]).decode('utf-8').strip()
        return ipa
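
    # Note: this shells out to the espeak binary, which must be installed
    # and on PATH (e.g. `apt-get install espeak`). As a rough illustration,
    # `espeak -q --ipa=1 -v en "gravity"` prints something close to
    # "ɡɹˈavɪti" (the exact output depends on the eSpeak version and voice).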

    def _split_ipa_components(self, ipa):
        '''
        Split an IPA transcription into a list of relevant components
        (syllable peaks and consonant clusters)
        --------------------------------------------------------------
        :ipa = [string] in IPA
        '''

        # Delete vowel-length marks if required
        if not self.vowel_length:
            ipa = re.sub('[ːˑ]', '', ipa)

        # Delete stress marks if required
        if not self.stress:
            ipa = re.sub('ˈ', '', ipa)

        # Delete whitespace
        ipa = re.sub(' ', '', ipa)

        # Remove everything before the final stress (if stress marks
        # have not already been deleted)
        ipa = ''.join(ipa.split('ˈ')[-1])

        # Split the transcription into components
        components = re.split('('+self.syllable_peaks+')', ipa)

        # Remove the first component if empty
        if not components[0]:
            components = components[1:]

        # Reduce to the required number of components
        if len(components) > self.syll_max:
            components = components[-self.syll_max*2:]

        # Reverse the order of components (line end first)
        components.reverse()

        return components
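
    # Worked example (with stress=True, vowel_length=True, syll_max=2),
    # illustrative only:
    #
    #   'ɡɹˈavɪti'
    #     -> keep sounds after the final stress:      'avɪti'
    #     -> split on peaks, drop leading empty:      ['a', 'v', 'ɪ', 't', 'i', '']
    #     -> keep at most syll_max*2 trailing items:  ['ɪ', 't', 'i', '']
    #     -> reverse (line end first):                ['', 'i', 't', 'ɪ']
    #
    # The trailing '' is the (empty) final consonant cluster of a word
    # that ends in a vowel.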

    def _final_ngram(self, word):
        '''
        Extract the word-final character n-gram
        --------------------------------------------------------------
        :word = [string]
        '''
        if len(word) > self.ngram_length:
            word = word[-self.ngram_length:]
        return word

    def train_model(self):
        '''
        Train the model
        --------------------------------------------------------------
        '''

        # Check that the model exists and contains some data
        if not hasattr(self, 'data') or len(self.data) == 0:
            raise Exception('You need to feed the model with poems first')

        # Count overall frequencies
        if self.verbose:
            print('Counting overall frequencies...')
        self._overall_frequencies()

        # Get collocations
        if self.verbose:
            print('Detecting collocations...')
        self._collocations()

        # Perform the required number of iterations
        for iteration in range(self.max_iter):
            if self.verbose:
                print('Learning iteration #{}... '.format(iteration+1))

            # Calculate probabilities
            improved = self._probabilities()

            # If probabilities did not improve, print the message
            # and stop iterating
            if not improved:
                print('\n\nSystem has reached equilibrium')
                break

            # If there's still improvement even in the last iteration,
            # print the message and stop so we don't build another
            # train set for no reason
            if iteration == self.max_iter - 1 and improved:
                print('\n\nSystem has not reached equilibrium')
                break

            # Rebuild the training set
            if self.ngram and iteration + 1 >= self.ngram:
                self._detect_rhymes(ngram=True, update_train_set=True)
            else:
                self._detect_rhymes(ngram=False, update_train_set=True)

    def _overall_frequencies(self):
        '''
        Calculate frequencies of words, word-pairs, n-grams and rhyme
        components in the entire corpus
        --------------------------------------------------------------
        '''

        # Iterate over lines in the dataset
        for i, l in enumerate(self.data):

            # Skip if there is no word at all in the line
            if not l[0]:
                continue

            # Increase the frequency of the rhyme word
            self.f['w'][l[0]] += 1

            # Increase the frequency of the final n-gram
            ngram = self.rhyme_vocab[l[0]][1]
            self.f['g'][ngram] += 1
            self.n['g'] += 1

            # Increase the frequency of each component and the total
            # for each position
            components = self.rhyme_vocab[l[0]][0]
            for j, s in enumerate(components):
                self.f[j][s] += 1
                self.n[j] += 1

            # Iterate forward over lines within the specified window
            for j in range(i+1, i+self.window+1):

                # Skip if
                # (1) the end of the dataset was reached OR
                # (2) the end of the poem was reached OR
                # (3) the end of the stanza was reached and inter-stanza
                #     rhymes are forbidden
                if (
                    j > len(self.data) - 1 or
                    l[1] != self.data[j][1] or
                    (self.stanza_limit and l[2] != self.data[j][2])
                ):
                    continue

                # Skip if there is no word at all in the j-line
                if not self.data[j][0]:
                    continue

                # Increase the frequency of the word-pair
                word_pair = tuple(sorted([l[0], self.data[j][0]]))
                self.f['wp'][word_pair] += 1
                self.n['wp'] += 1

    def _collocations(self):
        '''
        Detect relevant collocations among the rhyme words
        --------------------------------------------------------------
        '''

        # Iterate over pairs and calculate their t-scores
        for w1, w2 in self.f['wp']:

            # Skip if both words are the same and same-word rhymes
            # are forbidden
            if not self.same_words and w1 == w2:
                continue

            fxy = self.f['wp'][(w1, w2)]
            fx = self.f['w'][w1]
            fy = self.f['w'][w2]
            n = len(self.data)
            t_score = (fxy - (fx * fy / n)) / (fxy ** 0.5)

            # If both the t-score and the pair's frequency are high enough,
            # add the pair to the training set
            if t_score > self.t_score_min and fxy > self.frequency_min:
                self._add_to_train_set(w1, w2)
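
    # The t-score above is the standard collocation statistic
    #
    #     t = (f(x,y) - f(x) * f(y) / N) / sqrt(f(x,y))
    #
    # A hypothetical worked example: if "night"/"light" co-occur in the
    # window 20 times, occur 40 and 50 times individually, and the corpus
    # has 10,000 lines, then t = (20 - 40*50/10000) / sqrt(20) ≈ 4.43,
    # which clears the default threshold of 3.078.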

    def _add_to_train_set(self, w1, w2):
        '''
        Add a pair to the training set (both the ngram-pair and the
        individual component-pairs)
        --------------------------------------------------------------
        :w1 = [string] rhyme word #1
        :w2 = [string] rhyme word #2
        '''
        components1 = self.rhyme_vocab[w1][0]
        components2 = self.rhyme_vocab[w2][0]
        ngram1 = self.rhyme_vocab[w1][1]
        ngram2 = self.rhyme_vocab[w2][1]
        occurrences = self.f['wp'][(w1, w2)]

        # Add the ngram-pair
        pair = tuple(sorted([ngram1, ngram2]))
        self.train_set['g'][pair] += occurrences

        # Add the individual component-pairs
        for i, c in enumerate(components1):
            if i >= len(components2):
                continue
            pair = tuple(sorted([components1[i], components2[i]]))
            self.train_set[i][pair] += occurrences

    def _probabilities(self):
        '''
        Calculate the probabilities with which pairs of items (ngrams,
        components) indicate that two words rhyme.
        Keep a copy of the probabilities from the previous iteration so
        they can be compared to see whether there was any improvement
        --------------------------------------------------------------
        '''

        # Store a copy of the probabilities from the previous iteration
        # (a shallow copy is enough, since the inner dicts are rebuilt
        # from scratch below)
        self.probs_previous = self.probs.copy()

        # Empty the container for the new probabilities
        self.probs.clear()

        # Iterate over the types in the train set (ngrams, components 1...n)
        for x in self.train_set:

            # Total occurrences of this particular type in the train set
            nt = sum(self.train_set[x].values())

            # Iterate over pairs of values in the train set
            for a, b in self.train_set[x]:

                # Relative frequency of the pair in the train set
                ft_ab = self.train_set[x][(a, b)] / nt

                # Relative frequencies of both items in the entire corpus
                fca = self.f[x][a] / self.n[x]
                fcb = self.f[x][b] / self.n[x]

                # Get the probability that two words rhyme based on the
                # co-occurrence of a and b
                self.probs[x][tuple(sorted([a, b]))] = ft_ab / (ft_ab + fca * fcb)

        # Compare both sets of probabilities and report whether there
        # was any change
        if self.probs != self.probs_previous:
            return True
        else:
            return False
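
    # In other words, for each pair (a, b) the model estimates
    #
    #     P(rhyme | a, b) = f_t(a, b) / (f_t(a, b) + f_c(a) * f_c(b))
    #
    # where f_t is the pair's relative frequency in the train set and
    # f_c(a) * f_c(b) approximates how often a and b would co-occur
    # in the corpus by chance alone.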

    def _detect_rhymes(self, ngram=True, update_train_set=True):
        '''
        Count rhyme scores for pairs of lines that are within the
        specified window
        --------------------------------------------------------------
        :ngram = [boolean] whether to take n-grams into account
        :update_train_set = [boolean] whether to update the train set or
                            to return the list of rhymes
        '''
        rhymes_detected = defaultdict(set)

        # Iterate over lines in the dataset
        for i, l in enumerate(self.data):

            # Skip if there is no word at all in the i-line
            if not self.data[i][0]:
                continue

            # Iterate forward over lines within the specified window
            for j in range(i+1, i+self.window+1):

                # Skip if
                # (1) the end of the dataset was reached OR
                # (2) the end of the poem was reached OR
                # (3) the end of the stanza was reached and inter-stanza
                #     rhymes are forbidden OR
                # (4) both words are the same and same-word rhymes
                #     are forbidden
                if (
                    j > len(self.data) - 1 or
                    l[1] != self.data[j][1] or
                    (self.stanza_limit and l[2] != self.data[j][2]) or
                    (not self.same_words and l[0] == self.data[j][0])
                ):
                    continue

                # Skip if there is no word at all in the j-line
                if not self.data[j][0]:
                    continue

                # Get the rhyme score based on components
                ipa_score = self._rhyme_score(l[0], self.data[j][0])

                # If the score is high enough
                if ipa_score > self.prob_ipa_min:

                    # Add j to the i-line and i to the j-line
                    rhymes_detected[i].add(j)
                    rhymes_detected[j].add(i)

                    # Annotate distant rhymes
                    for k in rhymes_detected[i]:
                        if k != j:
                            rhymes_detected[k].add(j)
                            rhymes_detected[j].add(k)

            # If n-grams should be used and no rhymes were found for the
            # i-line, iterate over the window once again and perform
            # ngram-based recognition
            if not ngram:
                continue
            if i in rhymes_detected:
                continue
            for j in range(i+1, i+self.window+1):
                if (
                    j > len(self.data) - 1 or
                    l[1] != self.data[j][1] or
                    (self.stanza_limit and l[2] != self.data[j][2]) or
                    (not self.same_words and l[0] == self.data[j][0]) or
                    j in rhymes_detected
                ):
                    continue

                # Skip if there is no word at all in the j-line
                if not self.data[j][0]:
                    continue

                ngram_score = self._ngram_score(l[0], self.data[j][0])
                if ngram_score > self.prob_ngram_min:
                    rhymes_detected[i].add(j)
                    rhymes_detected[j].add(i)

        # Update the train set if required (training)
        if update_train_set:
            for i in rhymes_detected:
                for j in rhymes_detected[i]:
                    if i > j:
                        continue
                    self._add_to_train_set(self.data[i][0], self.data[j][0])

        # Otherwise format the output and return it
        else:
            output = self.output(rhymes_detected)
            return output

    def output(self, rhymes_detected):
        '''
        Return the output in the required format
        --------------------------------------------------------------
        :rhymes_detected = [dict] dict holding rhymes (keys are line
                           indices, values are sets holding the indices
                           of rhyming counterparts)
        '''

        # (1) List of lists where the elements of the main list correspond
        #     to particular lines and the sub-lists hold the indices of
        #     their rhyming counterparts
        if self.output_format == 1:
            output = []

            # Iterate over all lines
            for i in range(len(self.data)):

                # If there are some rhymes, append them to the output
                if i in rhymes_detected:
                    output.append(sorted(list(rhymes_detected[i])))

                # Otherwise append an empty list
                else:
                    output.append([])
            return output

        # (2|3) 2: Rhyme-chains; a list of lists where each sub-list holds
        #          the indices of lines that rhyme together
        #       3: ABBA-like scheme; a unique index is assigned to each
        #          rhyme chain, the output is a list where each element
        #          corresponds to a single line and holds this unique index
        elif self.output_format in (2, 3):
            output = []

            # Iterate over lines that rhyme with something
            for i in rhymes_detected:

                # Append the list of rhymes + the index of the current line
                output.append(sorted(list(rhymes_detected[i]) + [i]))

            # Make the list of lists unique
            output = [list(x) for x in set(tuple(x) for x in output)]

            # Sort it by the first element of each sub-list
            output = sorted(output, key=lambda x: x[0])

            # (2) Return a list of rhyme-chains
            if self.output_format == 2:
                return output

            # (3) Return an ABBA-like scheme
            else:
                output_abba = []
                for i in range(len(self.data)):
                    output_abba.append(next((
                        idx+1 for idx, elem in enumerate(output) if i in elem
                    ), None))
                return output_abba
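
    # For a hypothetical ABBA quatrain where lines 0/3 and 1/2 rhyme,
    # the three formats would look like this:
    #
    #   output_format=1: [[3], [2], [1], [0]]
    #   output_format=2: [[0, 3], [1, 2]]
    #   output_format=3: [1, 2, 2, 1]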

    def _rhyme_score(self, w1, w2):
        '''
        Calculate the overall score based on the probabilities of
        particular component-pairs
        --------------------------------------------------------------
        :w1 = [string] rhyme word #1
        :w2 = [string] rhyme word #2
        '''
        score = [1, 1]
        components1 = self.rhyme_vocab[w1][0]
        components2 = self.rhyme_vocab[w2][0]

        # If all components are the same, simply return score = 1
        if components1 == components2:
            return 1

        # Otherwise iterate over components
        for i, c in enumerate(components1):
            if i >= len(components2):
                continue

            # If the probability of the component-pair is known, use it
            if tuple(sorted([components1[i], components2[i]])) in self.probs[i]:
                p = self.probs[i][tuple(sorted([components1[i], components2[i]]))]

            # Otherwise, if both components are the same, assign 0.99
            elif components1[i] == components2[i]:
                p = 0.99

            # Otherwise assign 0.0001
            else:
                p = 0.0001

            # Multiply the components of the formula by the current values
            score[0] *= p
            score[1] *= (1-p)

        # Return the overall probability
        if (score[0] + score[1]) > 0:
            return score[0] / (score[0] + score[1])
        else:
            return 0
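
    # The loop above combines the per-position probabilities in
    # naive-Bayes fashion:
    #
    #     score = prod(p_i) / (prod(p_i) + prod(1 - p_i))
    #
    # so a single very unlikely component-pair (p close to 0) is enough
    # to pull the overall score towards 0.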

    def _ngram_score(self, w1, w2):
        '''
        Calculate the score based on the probabilities of n-grams
        --------------------------------------------------------------
        :w1 = [string] rhyme word #1
        :w2 = [string] rhyme word #2
        '''
        ngram1 = self.rhyme_vocab[w1][1]
        ngram2 = self.rhyme_vocab[w2][1]

        # If the probability of the ngram-pair is known, return it
        if tuple(sorted([ngram1, ngram2])) in self.probs['g']:
            return self.probs['g'][tuple(sorted([ngram1, ngram2]))]

        # Otherwise, if both n-grams are the same, return 0.99
        elif ngram1 == ngram2:
            return 0.99

        # Otherwise return 0.0001
        else:
            return 0.0001

    def save_model(self, file):
        '''
        Save the model into JSON for future use
        --------------------------------------------------------------
        :file = [string] path to a json file where the model will be stored
        '''
        model = {
            'settings': {
                'lang': self.lang,
                'window': self.window,
                'syll_max': self.syll_max,
                'vowel_length': self.vowel_length,
                'stress': self.stress,
                'same_words': self.same_words,
                'ngram': self.ngram,
                'ngram_length': self.ngram_length,
                't_score_min': self.t_score_min,
                'frequency_min': self.frequency_min,
                'stanza_limit': self.stanza_limit,
                'prob_ipa_min': self.prob_ipa_min,
                'prob_ngram_min': self.prob_ngram_min,
                'max_iter': self.max_iter,
            },
            'probs': self.probs,
        }
        if not file.endswith('.json'):
            file += '.json'
        with open(file, 'w') as f:
            f.write(ujson.dumps(model, indent=2))

    def load_model(self, model=None, verbose=True):
        '''
        Load a model from a JSON file
        --------------------------------------------------------------
        :model = [string] either the name of one of the pretrained models
                 or a path to a JSON file containing a custom model
        :verbose = [boolean] should the settings be printed out?
        '''
        if model is None:
            raise Exception('Please specify a model name or a path to a JSON file')
        if model.endswith('.json'):
            with open(model, 'r') as f:
                model = ujson.load(f)
        else:
            parent = os.path.dirname(__file__)
            with open(os.path.join(parent, 'models', model+'.json'), 'r') as f:
                model = ujson.load(f)

        self.lang = model['settings']['lang']
        self.window = model['settings']['window']
        self.syll_max = model['settings']['syll_max']
        self.stress = model['settings']['stress']
        self.same_words = model['settings']['same_words']
        self.vowel_length = model['settings']['vowel_length']
        self.ngram = model['settings']['ngram']
        self.ngram_length = model['settings']['ngram_length']
        self.t_score_min = model['settings']['t_score_min']
        self.frequency_min = model['settings']['frequency_min']
        self.stanza_limit = model['settings']['stanza_limit']
        self.prob_ipa_min = model['settings']['prob_ipa_min']
        self.prob_ngram_min = model['settings']['prob_ngram_min']
        self.max_iter = model['settings']['max_iter']

        # Load the probabilities (tuples need to be restored from the
        # strings that JSON serialization turned them into)
        probs = model['probs']
        self.probs = defaultdict(dict)
        for x in probs:
            for y in probs[x]:
                # Numeric keys are component positions; the rest ('g')
                # refers to n-grams
                if re.search('^[0-9]+$', x):
                    self.probs[int(x)][make_tuple(y)] = probs[x][y]
                else:
                    self.probs[x][make_tuple(y)] = probs[x][y]

        # Print info
        if verbose:
            print('='*36)
            print('Model loaded with the following settings:')
            print('='*36)
            maxlen = max(len(x) for x in model['settings'])
            for x in sorted(model['settings']):
                print('{}:'.format(x).rjust(maxlen+2), model['settings'][x])
            print('='*36)

    def tag(self, poem, transcribed=False, output_format=1, **kwargs):
        '''
        Perform tagging
        --------------------------------------------------------------
        :poem = [list] either a list of lines OR a list of lists (stanzas >
                lines); each item may be either a string holding the text
                of the line OR an ipa transcription (tagging only) OR a dict
                holding both orthography and ipa transcription
                {'text': ..., 'ipa': ...}
        :transcribed = [boolean] whether transcription will also be passed
        :output_format = [int] 1: returns a list of rhyming line indices for each line
                               2: returns a list of line indices for each rhyme-chain
                               3: returns a classic ABBA-like scheme with ints instead of letters
        [the following may be passed to modify settings inherited from the model]
        :lang = [string] language code as required by eSpeak
        :window = [int] how many lines forward to look for rhymes
        :same_words = [boolean] whether two identical words may rhyme
        :ngram = [int] upon which iteration to start taking n-grams into account
                 (one-based indexing, 0 = disregard n-grams completely)
        :t_score_min = [float] minimum t-score required to add a pair to the train set
        :frequency_min = [int] minimum number of pair occurrences required to add it to the train set
        :stanza_limit = [boolean] whether rhymes may only appear within the same stanza
        :prob_ipa_min = [float] minimum ipa-based probability to treat a pair as a rhyme
        :prob_ngram_min = [float] minimum ngram-based probability to treat a pair as a rhyme
        '''

        # Check that a model is loaded
        if not hasattr(self, 'probs'):
            raise Exception('No model loaded. Please run load_model() first.')

        # Parameters
        self.transcribed = transcribed
        self.output_format = output_format
        if 'lang' in kwargs:
            self.lang = kwargs['lang']
        if 'window' in kwargs:
            self.window = kwargs['window']
        if 'same_words' in kwargs:
            self.same_words = kwargs['same_words']
        if 'ngram' in kwargs:
            self.ngram = kwargs['ngram']
        if 't_score_min' in kwargs:
            self.t_score_min = kwargs['t_score_min']
        if 'frequency_min' in kwargs:
            self.frequency_min = kwargs['frequency_min']
        if 'stanza_limit' in kwargs:
            self.stanza_limit = kwargs['stanza_limit']
        if 'prob_ipa_min' in kwargs:
            self.prob_ipa_min = kwargs['prob_ipa_min']
        if 'prob_ngram_min' in kwargs:
            self.prob_ngram_min = kwargs['prob_ngram_min']

        # Slots to hold the current stanza and poem id
        self.stanza_id = 0
        self.poem_id = 0

        # Container for the dataset. Each item corresponds to a single line
        # and holds a tuple (rhyme_word, poem_id, stanza_id)
        self.data = list()

        # Container for the vocabulary. Each key is a rhyme_word found in
        # the dataset and holds a tuple ([components], final_ngram)
        self.rhyme_vocab = dict()

        # Perform tagging (self.ngram is used as a truthy flag here:
        # any non-zero value switches the n-gram fallback on)
        self.verbose = False
        self.add_to_model(poem)
        rhymes = self._detect_rhymes(ngram=self.ngram, update_train_set=False)
        return rhymes
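

if __name__ == '__main__':
    # A minimal usage sketch (illustrative, not part of the original file).
    # Assumes a pretrained model named 'en' is available in the package's
    # models/ directory; pass a path to your own JSON file otherwise. The
    # untranscribed path also requires the espeak binary to be installed.
    rt = RhymeTagger()
    rt.load_model(model='en')
    poem = [
        'Tyger Tyger, burning bright,',
        'In the forests of the night;',
        'What immortal hand or eye,',
        'Could frame thy fearful symmetry?',
    ]
    print(rt.tag(poem, output_format=3))
    # A hypothetical result would look like [1, 1, 2, 2]: lines sharing
    # a number rhyme together, and None marks unrhymed lines.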