-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathutils_diff.py
384 lines (319 loc) · 16 KB
/
utils_diff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
from difflib import SequenceMatcher
import re, nltk
# Stop-word vocabulary consulted by remove_stop_words(): NLTK's English
# stop-word list plus a handful of punctuation / newline tokens.
# Built once at import time and stored as a set for fast membership tests.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update([".", ",", "?", "!", ";", ":", "\n", "-"])
def tokenize(text):
    """Split *text* into a flat list of word and punctuation tokens.

    Whitespace runs are collapsed to a single space, em/en dashes are
    normalized to "-", and punctuation marks are padded with spaces so that a
    plain split on " " isolates them as their own tokens.
    """
    whitespace_run = re.compile(r"\s{2,}")
    normalized = whitespace_run.sub(" ", text)
    for dash in ("—", "–"):
        normalized = normalized.replace(dash, "-")
    # (mark, padded form) pairs. Each rule touches a different character and
    # only ever adds spaces, so the rules do not interfere with one another.
    padding_rules = (
        (".", " ."),
        (",", " ,"),
        ("!", " !"),
        ("?", " ?"),
        (";", " ;"),
        (":", " :"),
        ("\n", " \n "),
        ("'", " '"),
        ('"', ' " '),
        ("(", " ("),
        (")", " )"),
    )
    for mark, padded in padding_rules:
        normalized = normalized.replace(mark, padded)
    # NOTE: a padded newline (" \n ") is itself a run of whitespace, so this
    # final collapse turns it into a single space.
    return whitespace_run.sub(" ", normalized).split(" ")
def fix_punctuation(text):
    """Remove the space that tokenization left in front of punctuation marks.

    Returns *text* with e.g. " ." rewritten to "." and " \\n" rewritten to
    "\\n " (the space is moved after the newline).
    """
    # Applied one after another, in exactly this order: the newline rule
    # produces a new space that the later quote/bracket rules may consume.
    rewrites = (
        (" .", "."),
        (" ,", ","),
        (" !", "!"),
        (" ?", "?"),
        (" ;", ";"),
        (" :", ":"),
        (" \n", "\n "),
        (" '", "'"),
        (' "', '"'),
        (" (", "("),
        (" )", ")"),
    )
    result = text
    for spaced, attached in rewrites:
        result = result.replace(spaced, attached)
    return result
def untokenize(tokens):
    """Join *tokens* with single spaces and re-attach the punctuation."""
    spaced_text = " ".join(tokens)
    return fix_punctuation(spaced_text)
def remove_stop_words(tokens):
    """Return the tokens whose lower-cased form is not in ``stopwords``.

    Expects already-tokenized text: *tokens* must be a plain ``list``.
    The relative order of the surviving tokens is preserved.
    """
    assert type(tokens) == list
    kept = []
    for token in tokens:
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return kept
def split_sent_text(text):
    """Split *text* into sentence-like chunks at every period.

    Every chunk keeps its terminating "." (the last chunk has none if the text
    does not end with a period). Consecutive two-character chunks whose first
    character is a letter (such as "U." and "S.") are glued together into one
    chunk so that abbreviations like "U.S." are not broken apart. A final
    chunk that is empty or whitespace-only is dropped.

    Returns a list of strings; a text without any period is returned as a
    single-element list.
    """
    if "." not in text:
        return [text]
    pieces = text.split(".")
    last = len(pieces) - 1
    # split() removed the periods: put one back on every piece but the last.
    chunks = [piece + "." if i < last else piece for i, piece in enumerate(pieces)]

    merged = []
    pending = ""  # run of single-letter chunks being glued together
    for chunk in chunks:
        # Merge if it is single letters (like U.S. )
        if len(chunk) == 2 and chunk[:-1].isalpha():
            pending += chunk
            continue
        if pending:
            merged.append(pending)
            pending = ""
        merged.append(chunk)
    # Flush a run that reaches the end of the text. Without this, a short
    # final chunk (e.g. the "ab" of "Hi.ab") was silently lost.
    if pending:
        merged.append(pending)
    if merged and not merged[-1].strip():
        merged.pop()
    return merged
def split_edits_on_sentence(operations):
    """Break the edited text of each operation into per-sentence operations.

    "equal" operations are copied through. Any operation carrying a "delete"
    and/or "insert" text has that text split with split_sent_text(), and one
    new "delete" / "insert" operation is emitted per chunk. A "replace"
    operation (which carries both keys) therefore becomes its delete chunks
    followed by its insert chunks. ``N_words`` of the new operations is the
    number of spaces in the chunk plus one.
    """
    split_ops = []
    for op in operations:
        if op["type"] == "equal":
            split_ops.append({"type": "equal", "text": op["text"], "N_words": op["N_words"]})
        # Dispatch on key presence, deletions first, then insertions.
        for kind in ("delete", "insert"):
            if kind not in op:
                continue
            split_ops.extend(
                {"type": kind, kind: chunk, "N_words": chunk.count(" ") + 1}
                for chunk in split_sent_text(op[kind])
            )
    return split_ops
def get_edit_operations(text1, text2, split_replace=False, split_sentences=False, remove_stop_ws=False):
    """Compute the token-level edit operations turning *text1* into *text2*.

    Both texts are tokenized and aligned with difflib's SequenceMatcher.

    Args:
        text1: original text.
        text2: revised text.
        split_replace: if True, each "replace" is emitted as an "insert"
            operation followed by a "delete" operation instead of one
            combined "replace" operation.
        split_sentences: if True, the result is post-processed with
            split_edits_on_sentence().
        remove_stop_ws: if True, stop words are removed from both token lists
            before alignment.

    Returns:
        A list of dicts with a "type" of "equal" (key "text"), "insert"
        (key "insert"), "delete" (key "delete") or "replace" (keys "insert"
        and "delete"), each with an "N_words" token count. For "replace" the
        count is the sum of both sides.
    """
    src_tokens = tokenize(text1)
    tgt_tokens = tokenize(text2)
    if remove_stop_ws:
        src_tokens = remove_stop_words(src_tokens)
        tgt_tokens = remove_stop_words(tgt_tokens)

    matcher = SequenceMatcher(None, src_tokens, tgt_tokens)
    operations = []
    for code, i1, i2, j1, j2 in matcher.get_opcodes():
        removed = src_tokens[i1:i2]
        added = tgt_tokens[j1:j2]
        if code == "insert":
            operations.append({"type": "insert", "insert": untokenize(added), "N_words": len(added)})
        elif code == "delete":
            operations.append({"type": "delete", "delete": untokenize(removed), "N_words": len(removed)})
        elif code == "replace":
            if split_replace:
                # Each half reports the size of its own text (these two
                # counts used to be swapped).
                operations.append({"type": "insert", "insert": untokenize(added), "N_words": len(added)})
                operations.append({"type": "delete", "delete": untokenize(removed), "N_words": len(removed)})
            else:
                operations.append({
                    "type": "replace",
                    "insert": untokenize(added),
                    "delete": untokenize(removed),
                    "N_words": len(added) + len(removed),
                })
        elif code == "equal":
            operations.append({"type": "equal", "text": untokenize(removed), "N_words": len(removed)})

    if split_sentences:
        operations = split_edits_on_sentence(operations)
    return operations
def make_color(text, color, style="shell"):
    """Wrap *text* in the start/end markers of *color* for the given *style*.

    Args:
        text: the string to wrap.
        color: one of "green", "red", "blue".
        style: one of "shell" (ANSI escape codes), "xml", "html", "llm"
            (plain-word markers) or "latex".

    Returns:
        The start marker, *text* and the end marker concatenated.

    Raises:
        AssertionError: if *color* or *style* is not one of the supported
            values.
    """
    assert color in ["green", "red", "blue"]
    assert style in ["shell", "xml", "html", "llm", "latex"]
    # style -> color -> (start marker, end marker)
    markers = {
        "shell": {
            "green": ("\033[1;32m", "\033[0m"),
            "red": ("\033[1;31m", "\033[0m"),
            "blue": ("\033[1;34m", "\033[0m"),
        },
        "xml": {
            "green": (" <ins> ", " </ins> "),
            "red": (" <del> ", " </del> "),
            "blue": (" <blue> ", " </blue> "),
        },
        # Raw strings: "\h" is not a valid escape sequence and triggers a
        # SyntaxWarning on recent Python versions when written unescaped.
        # NOTE(review): "green" maps to \hlblue here, as in the original
        # table - confirm this is intended.
        "latex": {
            "green": (r" \hlblue{", "} "),
            "red": (r" \hlred{", "} "),
            "blue": (r" \hlblue{ ", " } "),
        },
        "html": {
            "green": (" <span class='green'> ", " </span> "),
            "red": (" <span class='red'> ", " </span> "),
            "blue": (" <span class='blue'> ", " </span> "),
        },
        "llm": {
            "green": (" ADD ", " EADD "),
            "red": (" DEL ", " EDEL "),
            "blue": (" INFO ", " EINFO "),
        },
    }
    start, end = markers[style][color]
    return start + text + end
def make_colored_text(text1, text2, style="shell", split_replace=False, split_sentences=False, remove_stop_ws=False):
    """Diff *text1* against *text2* and render the result as marked-up text.

    The diff options are forwarded unchanged to get_edit_operations(); the
    resulting operations are rendered with
    make_colored_text_from_operations() in the requested *style*.
    """
    diff_options = {
        "split_replace": split_replace,
        "split_sentences": split_sentences,
        "remove_stop_ws": remove_stop_ws,
    }
    edit_ops = get_edit_operations(text1, text2, **diff_options)
    return make_colored_text_from_operations(edit_ops, style=style)
def make_colored_text_from_operations(operations, style="shell"):
    """Render a list of edit operations as one marked-up string.

    Insertions are wrapped as "green", deletions as "red", a replace is shown
    as its deletion followed by its insertion, and equal text is emitted
    as-is. Afterwards every line is stripped, punctuation is re-attached and
    whitespace runs are collapsed to a single space.
    """
    fragments = []
    for op in operations:
        kind = op["type"]
        if kind == "insert":
            fragments.append(make_color(op["insert"], "green", style=style))
        elif kind == "delete":
            fragments.append(make_color(op["delete"], "red", style=style))
        elif kind == "replace":
            fragments.append(make_color(op["delete"], "red", style=style))
            fragments.append(make_color(op["insert"], "green", style=style))
        elif kind == "equal":
            fragments.append(" " + op["text"])
    stripped = "\n".join(line.strip() for line in "".join(fragments).split("\n"))
    # Replace 2+ whitespace characters by a single space
    return re.sub(r"\s{2,}", " ", fix_punctuation(stripped))
def highlight_operations(operations, op_idxs, style="shell"):
    """Render *operations*, marking up only those whose index is in *op_idxs*.

    Selected operations are wrapped in the insert/delete markers of *style*
    ("shell", "xml", or anything else for plain ADD/DEL ... END markers).
    Non-selected operations are rendered as if they were not applied: the
    deleted/replaced text is shown plain and inserted text is left out.
    The result is line-stripped, punctuation-fixed and whitespace-collapsed.
    """
    # Should be refactored, there's overlap
    if style == "shell":
        ins_open, ins_close = "\033[1;32m", "\033[0m"
        del_open, del_close = "\033[1;31m", "\033[0m"
    elif style == "xml":
        ins_open, ins_close = " <green> ", " </green> "
        del_open, del_close = " <red> ", " </red> "
    else:
        ins_open, ins_close = " ADD ", " END "
        del_open, del_close = " DEL ", " END "

    pieces = []
    for idx, op in enumerate(operations):
        kind = op["type"]
        if kind == "equal":
            pieces.append(" " + op["text"])
        elif idx in op_idxs:
            if kind == "insert":
                pieces.append(ins_open + op["insert"] + ins_close)
            elif kind == "delete":
                pieces.append(del_open + op["delete"] + del_close)
            elif kind == "replace":
                pieces.append(del_open + op["delete"] + del_close + " " + ins_open + op["insert"] + ins_close)
        elif kind in ("delete", "replace"):
            # Not highlighted: keep the original wording, skip pure insertions.
            pieces.append(" " + op["delete"] + " ")

    stripped_lines = [line.strip() for line in "".join(pieces).split("\n")]
    rendered = fix_punctuation("\n".join(stripped_lines))
    # Replace 2+ whitespace characters by a single space
    return re.sub(r"\s{2,}", " ", rendered)
def remove_operations(operations, op_idxs, style="shell"):
    """Render *operations* with the ones listed in *op_idxs* undone.

    Operations whose index is NOT in *op_idxs* are kept and marked up with
    the insert/delete markers of *style*. Operations whose index IS in
    *op_idxs* are rendered as if they were not applied: the deleted/replaced
    text is shown plain and inserted text is left out.

    Args:
        operations: list of edit-operation dicts (see get_edit_operations).
        op_idxs: container of indices of the operations to undo.
        style: "shell" (ANSI codes), "xml" (<ins>/<del>), "none" (no markup:
            kept insertions are shown plain, kept deletions are omitted), or
            anything else for plain ADD/DEL ... END markers.

    Returns:
        The rendered string, line-stripped, punctuation-fixed and with
        whitespace runs collapsed to a single space.
    """
    # Should be refactored, there's overlap
    if style == "shell":
        start_green, end_green = "\033[1;32m", "\033[0m"
        start_red, end_red = "\033[1;31m", "\033[0m"
    elif style == "xml":
        start_green, end_green = " <ins> ", " </ins> "
        start_red, end_red = " <del> ", " </del> "
    elif style == "none":
        start_green, end_green = " ", " "
        start_red, end_red = " ", " "
    else:
        start_green, end_green = " ADD ", " END "
        start_red, end_red = " DEL ", " END "

    show_deletions = style != "none"
    colored_text = ""
    for idx, operation in enumerate(operations):
        kind = operation["type"]
        if kind == "equal":
            colored_text += " " + operation["text"]
        elif idx not in op_idxs:
            if kind == "insert":
                colored_text += start_green + operation["insert"] + end_green
            elif kind == "delete":
                if show_deletions:
                    colored_text += start_red + operation["delete"] + end_red
            elif kind == "replace":
                if show_deletions:
                    # This used to be a bare expression whose value was
                    # discarded, so the deleted half of a kept replace was
                    # never rendered.
                    colored_text += start_red + operation["delete"] + end_red + " "
                colored_text += start_green + operation["insert"] + end_green
        elif kind in ("delete", "replace"):
            # Undone operation: restore the original wording; an undone
            # insertion contributes nothing.
            colored_text += " " + operation["delete"] + " "

    lines = [line.strip() for line in colored_text.split("\n")]
    text = fix_punctuation("\n".join(lines))
    # Replace 2+ whitespace characters by a single space
    text = re.sub(r"\s{2,}", " ", text)
    return text
def compute_equal_ratio(text1, text2, partial=False, remove_stop_ws=False):
    """Return the share of words that sit in "equal" edit operations.

    The ratio is the summed ``N_words`` of the "equal" operations divided by
    the summed ``N_words`` of all operations between *text1* and *text2*.

    Args:
        text1: original text.
        text2: revised text.
        partial: if True and at least one "equal" operation exists, the
            operations before the first and after the last "equal" operation
            are ignored.
        remove_stop_ws: forwarded to get_edit_operations() to drop stop words
            before diffing.

    Returns:
        A float in [0, 1]. When there is nothing to compare (no operations,
        e.g. both texts consist only of stop words and *remove_stop_ws* is
        set) the texts are treated as identical and 1.0 is returned.
    """
    operations = get_edit_operations(text1, text2, remove_stop_ws=remove_stop_ws)
    equal_idx = [i for i, operation in enumerate(operations) if operation["type"] == "equal"]
    # We're letting go of start and end edits
    if partial and equal_idx:
        operations = operations[equal_idx[0]:equal_idx[-1] + 1]

    n_total = sum(operation["N_words"] for operation in operations)
    if n_total == 0:
        # Two empty token sequences: avoid ZeroDivisionError and follow
        # difflib's convention that empty sequences are a perfect match.
        return 1.0
    n_equal = sum(operation["N_words"] for operation in operations if operation["type"] == "equal")
    return n_equal / n_total
def compute_addition_ratio(text1, text2):
    """Return a signed, normalized balance of removed versus added words.

    Sums ``N_words`` over the "insert" and over the "delete" operations
    between *text1* and *text2* ("replace" operations are not counted) and
    returns ``(removed - added) / (max(added, removed) + 1)``. The value is
    positive when more words were removed than added.
    """
    word_totals = {"insert": 0, "delete": 0}
    for op in get_edit_operations(text1, text2):
        if op["type"] in word_totals:
            word_totals[op["type"]] += op["N_words"]
    n_added = word_totals["insert"]
    n_removed = word_totals["delete"]
    # The +1 keeps the denominator non-zero when nothing was added or removed.
    return (n_removed - n_added) / (max(n_added, n_removed) + 1)
def construct_partial_pair(text1, text2):
    """Rebuild both texts from their diff, trimmed to the shared span.

    The edit operations before the first and after the last "equal" operation
    are discarded (when at least one "equal" operation exists). Each side is
    then reassembled from the remaining operations: deleted text goes to the
    first text, inserted text to the second, equal text to both.

    Returns the tuple ``(partial_text1, partial_text2)``.
    """
    operations = get_edit_operations(text1, text2)
    equal_positions = [i for i, op in enumerate(operations) if op["type"] == "equal"]
    if equal_positions:
        # We're letting go of start and end edits
        operations = operations[equal_positions[0]:equal_positions[-1] + 1]

    side1, side2 = [], []
    for op in operations:
        kind = op["type"]
        if kind == "insert":
            side2.append(" " + op["insert"])
        elif kind == "delete":
            side1.append(" " + op["delete"])
        elif kind == "replace":
            side1.append(" " + op["delete"])
            side2.append(" " + op["insert"])
        elif kind == "equal":
            side1.append(" " + op["text"])
            side2.append(" " + op["text"])
    return fix_punctuation("".join(side1)), fix_punctuation("".join(side2))
# Next up is the overlap samples (for IAA)
def is_completed(d, anno_name):
    """Tell whether annotator *anno_name* has covered every non-equal edit.

    Looks up ``d["annotations_<anno_name>"]``; if that key is missing the
    sample is not completed. Otherwise each edit in ``d["edits"]`` is stamped
    in place with its position under the "idx" key, and the function returns
    True only when the set of non-"equal" edit positions is exactly the set
    of positions referenced by the annotations' "edit_idxs".
    """
    anno_key = "annotations_%s" % anno_name
    if anno_key not in d:
        return False

    edits = d["edits"]
    for position, edit in enumerate(edits):
        edit["idx"] = position
    pending = {edit["idx"] for edit in edits if edit["type"] != "equal"}

    covered = set()
    for anno in d[anno_key]:
        covered.update(anno["edit_idxs"])
    return pending == covered
def cleanup_annotation(annotations, edits):
    """Normalize a list of edit annotations.

    Steps, in order:
      1. Annotations without any "edit_idxs" are dropped.
      2. An annotation whose inserted and deleted texts are all stop words
         (after lower-casing) is relabeled "syntactic_generic"; the
         annotation dict is modified in place and the relabel is recorded.
      3. All "semantic_deletion" annotations are regrouped so that each one
         covers one run of consecutive edit indices.
      4. The annotations are sorted by their smallest edit index.

    Returns the tuple ``(annotations, num_fixes, swaps)`` where *num_fixes*
    counts the relabels of step 2 and *swaps* lists them as
    ``(old_type, "syntactic_generic")`` pairs.
    """
    stop_words = ["of", "in", "the", "on", "or", "and", "it", "its", "a", "in the", ";", ","]
    num_fixes = 0
    swaps = []
    annotations = [anno for anno in annotations if len(anno["edit_idxs"]) > 0]

    for anno in annotations:
        touched = [edits[idx] for idx in anno["edit_idxs"]]
        changed_texts = [edit.get(side, "") for side in ("insert", "delete") for edit in touched]
        changed_texts = [txt for txt in changed_texts if len(txt) > 0]
        # Only annotations made purely of stop-word edits are relabeled
        if any(txt.lower() not in stop_words for txt in changed_texts):
            continue
        if anno["edit_type"] == "syntactic_generic":
            continue
        swaps.append((anno["edit_type"], "syntactic_generic"))
        anno["edit_type"] = "syntactic_generic"
        num_fixes += 1

    # Make all semantic deletions adjacent: one annotation per consecutive run
    deletion_idxs = sorted(
        idx
        for anno in annotations
        if anno["edit_type"] == "semantic_deletion"
        for idx in anno["edit_idxs"]
    )
    runs = []
    for idx in deletion_idxs:
        if runs and idx == runs[-1][-1] + 1:
            runs[-1].append(idx)
        else:
            runs.append([idx])

    kept = [anno for anno in annotations if anno["edit_type"] != "semantic_deletion"]
    kept.extend({"edit_type": "semantic_deletion", "edit_idxs": run} for run in runs)
    # Resort by min edit_idx in anno
    kept.sort(key=lambda anno: min(anno["edit_idxs"]))
    return kept, num_fixes, swaps
class DiffScorer:
    """Scores pairs of texts with one of the diff-based ratios of this module.

    The scorer *type* selects the metric: "addition_ratio" uses
    compute_addition_ratio(); every other supported type uses
    compute_equal_ratio(), with the "partial" and "nostop" substrings of the
    type name switching on the corresponding options.
    """

    def __init__(self, type="equal_ratio"):
        """Store the scorer type and derive the option flags from its name."""
        supported_types = [
            "equal_ratio",
            "equal_ratio_partial",
            "addition_ratio",
            "equal_ratio_nostop",
            "equal_ratio_partial_nostop",
        ]
        assert type in supported_types
        self.type = type
        self.partial = "partial" in type
        self.nostop = "nostop" in type

    def score_one(self, text1, text2):
        """Return the configured score for a single (text1, text2) pair."""
        if self.type != "addition_ratio":
            return compute_equal_ratio(text1, text2, partial=self.partial, remove_stop_ws=self.nostop)
        return compute_addition_ratio(text1, text2)

    def score(self, texts1, texts2):
        """Score equally long lists of texts pairwise.

        Returns a dict with the list of per-pair scores under "scores".
        """
        assert len(texts1) == len(texts2)
        pair_scores = [self.score_one(first, second) for first, second in zip(texts1, texts2)]
        return {"scores": pair_scores}
if __name__ == "__main__":
    # Small manual demo: print the colored diff of two short biographies.

    # Alternative example pair, kept for reference:
    # text1 = "Yelko Gómez (born March 9, 1989, Chiriquí Province) is a Panamanian cyclist riding for Wilier Panama."
    # text2 = "Yelko Gómez (born March 9, 1989 in Chiriquí Province) is a Panamanian cyclist. He is riding for Hyundai-Momi-Continental."
    # print(get_edit_operations(text1, text2, split_replace=True, split_sentences=True))

    r_content = 'Frederick Hubbard "Fred" Gwynne (10 July 1926 – 2 July 1993) was an American actor. Gwynne is best known for his roles as Francis Muldoon and Herman Munster in the 1960s sitcoms Car 54, Where Are You? and The Munsters, respectively, and as Jud Crandall in "Pet Semetary."'
    s_content = "Fred Gwynne (July 10,1926-1993) was an American actor. He was best known for his role as Herman Munster on The Munsters. He also wrote a series of children's books."

    # edits = get_edit_operations(r_content, s_content, split_replace=True, split_sentences=True)
    colored_diff = make_colored_text(r_content, s_content, split_replace=True, split_sentences=True)
    print(colored_diff)
    print("---")

    # print("[%.3f] %s "% (compute_equal_ratio(text1, text2, remove_stop_ws=False), make_colored_text(text1, text2, remove_stop_ws=False)))
    # print("------")
    # print("[%.3f] %s "% (compute_equal_ratio(text1, text2, remove_stop_ws=True), make_colored_text(text1, text2, remove_stop_ws=True)))