Skip to content

Commit

Permalink
Fix glosstag surface matching
Browse files (browse the repository at this point in the history)
  • Loading branch information
letuananh committed Apr 12, 2018
1 parent 63d9bf4 commit f8c07c5
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 15 deletions.
12 changes: 6 additions & 6 deletions test/test_main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -72,8 +72,6 @@ def getLogger():

class TestHelperMethods(unittest.TestCase):

nullrep = TextReport('/dev/null')

def test_dump_synset(self):
print("Test get synset by ID")
gwn = get_gwn()
Expand All @@ -91,12 +89,14 @@ def test_dump_synsets(self):
dump_synsets(None)

def test_get_by_term(self):
sses = get_synsets_by_term(GWNSQL(YLConfig.GWN30_DB), 'test', report_file=self.nullrep)
self.assertEqual(len(sses), 13)
with TextReport.null() as rp:
sses = get_synsets_by_term(GWNSQL(YLConfig.GWN30_DB), 'test', report_file=rp)
self.assertEqual(len(sses), 13)

def test_get_by_sk(self):
ss = get_synset_by_sk(get_gwn(), 'test%2:41:00::', report_file=self.nullrep)
self.assertIsNotNone(ss)
with TextReport.null() as rp:
ss = get_synset_by_sk(get_gwn(), 'test%2:41:00::', report_file=rp)
self.assertIsNotNone(ss)

def test_search_wn_full_text(self):
rp = TextReport.string()
Expand Down
29 changes: 20 additions & 9 deletions yawlib/glosswordnet/gwnmodels.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -52,10 +52,13 @@
# Models
# -----------------------------------------------------------------------

logger = logging.getLogger(__name__)
REMAIN_PATTERN = re.compile('[" ;]')


def getLogger():
return logging.getLogger(__name__)


# -----------------------------------------------------------------------
# Models
# -----------------------------------------------------------------------
Expand Down Expand Up @@ -128,16 +131,18 @@ def get_tags(self):
return keys

def match_surface(self, raws=None):
''' Match tokens in each gloss to original synset def+ex string '''
raws = self.get_orig().split() if raws is None else list(raws)
glosses = list(self.glosses)
glosses = list(self.glosses) # remaining glosses
logger = getLogger()
try:
# try to match normally first
for r, g in zip(raws, glosses):
tokens = [t.text for t in g]
while tokens[-1] == ';':
tokens.pop()
sent = ttl.Sentence(r)
sent.import_tokens(tokens)
sent.tokens = tokens
# seems ok ...
for r, g in zip(raws, glosses):
g.surface = r
Expand All @@ -148,9 +153,9 @@ def match_surface(self, raws=None):
d = self.get_def()
for idx, raw in enumerate(raws):
sent = ttl.Sentence(raw)
tokens = [i.text for i in d.items]
try:
tokens = [i.text for i in d.items]
sent.import_tokens(tokens)
sent.tokens = tokens
# found the def raw
if "(" in raw:
new_part = raw.replace("(", ";(").split(";")
Expand All @@ -162,21 +167,26 @@ def match_surface(self, raws=None):
continue
while len(raws) > 0:
raw = raws.pop()
s = ttl.Sentence(raw)
for idx, g in enumerate(glosses):
s = ttl.Sentence(raw)
tokens = [t.text for t in g.items]
# logger.debug("raw = {} | tokens = {}".format(s.text, tokens))
while tokens[-1] == ';':
tokens.pop()
try:
s.import_tokens(tokens)
g.surface = raw
glosses.pop(idx) # remove this gloss as it's matched
raw = None
break
except:
# move on to the next one
# logger.exception("Failed to match: {} to {}".format(g.text(), tokens))
pass
if raw:
logger.warning("Could not match {} to anything | remaining glosses: {}".format(raw, [g.text() for g in glosses]))
if len(glosses) > 0:
raise Exception("mismatched {}".format(glosses))
raise Exception("mismatched! Remaining glosses: {} | orig: {}".format([g.text() for g in glosses], self.get_orig()))
return True

def __getitem__(self, name):
Expand Down Expand Up @@ -297,9 +307,10 @@ def to_ttl(self, doc=None):
''' Export to TextTagLib format (Read more: :mod:`~chirptext.texttaglib`) '''
sid = self.origid if self.origid else "{}{}_{}".format(self.synset.ID.offset, self.synset.ID.pos, self.cat)
if doc is not None:
sent = doc.new_sent(text=self.text(), ID=sid)
sent = doc.new_sent(text=self.text())
else:
sent = ttl.Sentence(text=self.text(), ID=sid)
sent = ttl.Sentence(text=self.text())
sent.new_tag(sid, tagtype='origid')
colls = dd(list)
item_map = {}
# import tokens
Expand Down

0 comments on commit f8c07c5

Please sign in to comment.