-
Notifications
You must be signed in to change notification settings - Fork 0
/
ChasenCorpus.py
42 lines (35 loc) · 1.11 KB
/
ChasenCorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class ChasenCorpus(object):
    """One parsed line of a tab-separated morphological corpus.

    A line is split on tab characters and must yield 1, 4 or 6 fields:

    * 1 field  -- a sentence marker, exactly ``BOS`` or ``EOS``.
    * 4 fields -- ``lemma``, ``pron``, ``base``, ``pos``.
    * 6 fields -- the four above plus ``conj_type`` and ``conj_form``.

    The ``pos`` field is further split on ``-`` into a list of parts.

    Attributes:
        lemma: first field (``'BOS'`` / ``'EOS'`` for sentence markers).
        pron: second field (same marker text for sentence markers).
        base: third field (same marker text for sentence markers).
        pos: list of the ``-``-separated parts of the fourth field;
            empty for sentence markers.
        conj_type: fifth field, or ``''`` when the line has no such field.
        conj_form: sixth field, or ``''`` when the line has no such field.
        is_bos: True only for a ``BOS`` marker line.
        is_eos: True only for an ``EOS`` marker line.
    """

    # Field counts accepted by the constructor.
    _VALID_FIELD_COUNTS = (1, 4, 6)

    def __init__(self, raw_str):
        """Parse ``raw_str`` into the instance attributes.

        Args:
            raw_str: one corpus line; a trailing ``\\n`` / ``\\r\\n`` is
                tolerated and ignored.

        Raises:
            ValueError: if the line does not have 1, 4 or 6 tab-separated
                fields, or has a single field that is neither ``BOS`` nor
                ``EOS``.  (``ValueError`` is a subclass of ``Exception``,
                so ``except Exception`` handlers keep working.)
        """
        self.lemma = ''
        self.pron = ''
        self.base = ''
        self.pos = []
        self.conj_type = ''
        self.conj_form = ''
        self.is_bos = False
        self.is_eos = False

        # Lines read straight from a file still carry their terminator;
        # without stripping it, "EOS\n" is not recognised as a marker and
        # the newline leaks into the last field of ordinary records.
        tokens = raw_str.rstrip('\r\n').split('\t')
        if len(tokens) not in self._VALID_FIELD_COUNTS:
            raise ValueError('invalid corpus line : ' + raw_str)
        self._extract(tokens)

    def _extract(self, tokens):
        """Fill the attributes from the already-split ``tokens`` list.

        Raises:
            ValueError: if ``tokens`` is not a sentence marker and has
                fewer than the four fields a record needs.
        """
        head = tokens[0]

        # NOTE(review): a 4/6-field record whose first field happens to be
        # the text "BOS" or "EOS" is also treated as a marker here, exactly
        # as before -- confirm whether that is intended.
        if head == 'BOS':
            self.is_bos = True
            self.lemma = self.pron = self.base = 'BOS'
            return None
        if head == 'EOS':
            self.is_eos = True
            self.lemma = self.pron = self.base = 'EOS'
            return None

        # A lone field that is not a marker used to fall through to
        # tokens[1] and die with a bare IndexError; report it as an
        # invalid line instead.
        if len(tokens) < 4:
            raise ValueError('invalid corpus line : ' + '\t'.join(tokens))

        self.lemma, self.pron, self.base = tokens[:3]
        self.pos = tokens[3].split('-')

        if len(tokens) == 6:
            self.conj_type = tokens[4]
            self.conj_form = tokens[5]
        else:
            self.conj_type = ''
            self.conj_form = ''
        return None