-
Notifications
You must be signed in to change notification settings - Fork 2
/
canonical_form.py
93 lines (81 loc) · 3.9 KB
/
canonical_form.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
'''Baseline that classifies all PIE instances in a canonical form as idiomatic'''
import json, re, os
def canonical_form(data, use_test_data, context_size, data_dir):
    '''
    Apply the canonical form baseline.

    Every PIE instance in the selected split ('test' when use_test_data
    is truthy, otherwise 'dev') gets predicted_label 'i' if any of the
    canonical forms of its PIE type is found in its sentence, and 'l'
    if none is found. Instances in other splits are left untouched.

    data: iterable of PIE instances; each one is read through its
        .split, .context and .pie_type attributes and written through
        .predicted_label.
    use_test_data: selects the 'test' split instead of the 'dev' split.
    context_size: index into PIE.context of the sentence to search
        (presumably the sentence containing the PIE -- verify against
        the code that builds the context).
    data_dir: directory holding the idiom dictionaries; passed on to
        get_canonical_forms.

    Returns the same data object, with labels set in place.
    '''
    forms_by_type = get_canonical_forms(data, use_test_data, data_dir)
    target_split = 'test' if use_test_data else 'dev'
    for instance in data:
        if instance.split != target_split:
            continue
        sentence = instance.context[context_size]
        patterns = forms_by_type[instance.pie_type]
        # Default label; only overwritten when a canonical form is found
        instance.predicted_label = 'l'
        # Canonical forms are used as regular expressions, case is ignored
        if any(re.search(pattern, sentence, flags = re.I) for pattern in patterns):
            instance.predicted_label = 'i'
    return data
def get_canonical_forms(data, use_test_data, data_dir):
    '''
    Retrieves canonical forms for PIE types from dictionaries.

    data: iterable of PIE instances; only .pie_type and .split are read.
    use_test_data: if truthy, collect PIE types from the 'test' split,
        otherwise from the 'dev' split.
    data_dir: directory containing 'idiom_list_ue_final.json' and
        'idiom_list_wiktionary_final.json'. Both files must hold a JSON
        list of strings (the lists are concatenated and lower-cased).

    Returns a dict mapping each PIE type of the selected split to a
    duplicate-free list of canonical forms: the PIE type itself, every
    dictionary entry that contains all of its non-article words, and
    the pronoun expansions produced by expand_indefinite_pronouns.
    '''
    canonical_forms = {}
    # Load dictionaries, lower-case and join together. The files are
    # opened in 'with' blocks so the handles are closed right away.
    with open(os.path.join(data_dir, 'idiom_list_ue_final.json'), 'r') as ue_file:
        ue = json.load(ue_file)
    with open(os.path.join(data_dir, 'idiom_list_wiktionary_final.json'), 'r') as wiktionary_file:
        wiktionary = json.load(wiktionary_file)
    combined_dictionary = list(set(entry.lower() for entry in ue + wiktionary))
    # Get set of PIE types
    target_split = 'test' if use_test_data else 'dev'
    PIE_types = list(set(PIE.pie_type for PIE in data if PIE.split == target_split))
    # Select canonical forms from dictionary by matching content words
    articles = ('a', 'the', 'an')
    for PIE_type in PIE_types:
        canonical_forms[PIE_type] = [PIE_type]
        words = [word for word in PIE_type.split(' ') if word not in articles]
        for entry in combined_dictionary:
            # Each word is searched for as a regular expression anywhere
            # in the entry, so it also matches inside longer words.
            # all() stops at the first word that is not found.
            if all(re.search(word, entry, flags = re.I) for word in words):
                canonical_forms[PIE_type].append(entry)
        # Deal with placeholder words (one's, someone's, etc.)
        canonical_forms[PIE_type] = expand_indefinite_pronouns(canonical_forms[PIE_type])
        # Filter out duplicates
        canonical_forms[PIE_type] = list(set(canonical_forms[PIE_type]))
    return canonical_forms
def expand_indefinite_pronouns(canonical_forms):
    '''
    When one's, someone's, someone, your or something occurs in a canonical
    form, add canonical forms with personal pronouns added in. Don't
    expand 'one', because it is too ambiguous.

    canonical_forms: iterable of canonical form strings.

    Returns a new list. Each input form is kept, directly followed by its
    expansions. Only the first matching placeholder kind (in the order
    one's/your, someone's, someone, something) is expanded, and every
    occurrence of that placeholder is replaced. For someone's, someone
    and something the last expansion contains a regular expression
    wildcard (\\w+'s or \\w+), so the result is meant to be used as a
    regex pattern. Duplicates are not removed.
    '''
    possessive_pronouns = ['my', 'your', 'his', 'her', 'its', 'our', 'their']
    objective_pronouns = ['me', 'you', 'him', 'her', 'us', 'them', 'it']
    # Ordered (placeholder pattern, replacements) rules; the first pattern
    # found in a form wins. "someone's" has to be tried before "someone".
    # The wildcards are u'' literals: the same value and type that
    # unicode(..., 'utf-8') produced, but also valid outside Python 2.
    expansion_rules = [
        # Add possessive pronouns only
        (re.compile(r"\b(one's|your)\b"), possessive_pronouns),
        # Add possessive pronouns and a wildcard for other words
        (re.compile(r"\bsomeone's\b"), possessive_pronouns + [u"\\w+'s"]),
        # Add objective pronouns and a wildcard for other words
        (re.compile(r"\bsomeone\b"), objective_pronouns + [u"\\w+"]),
        # Add 'it' and 'them' and a wildcard for other words
        (re.compile(r"\bsomething\b"), ['it', 'them', u"\\w+"]),
    ]
    expanded_canonical_forms = []
    for canonical_form in canonical_forms:
        expanded_canonical_forms.append(canonical_form)
        for placeholder, replacements in expansion_rules:
            if not placeholder.search(canonical_form):
                continue
            for replacement in replacements:
                # A callable replacement is inserted verbatim. Passing the
                # wildcard as a template string would make re.sub interpret
                # the backslash in '\w', which newer Python versions reject
                # as a bad escape.
                expanded_canonical_forms.append(
                    placeholder.sub(lambda _match, text=replacement: text, canonical_form))
            break
    return expanded_canonical_forms