From 82c4e7cba692794df4bd1003f633b60b9fac21fe Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 12 Jan 2017 17:46:43 +0100 Subject: [PATCH 1/4] Optimize diff: Use autojunk in approx text compare --- nbdime/diffing/generic.py | 4 ++-- nbdime/diffing/notebooks.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py index 1686cef3..975f51f3 100644 --- a/nbdime/diffing/generic.py +++ b/nbdime/diffing/generic.py @@ -33,7 +33,7 @@ def default_differs(): return defaultdict(lambda: diff) -def compare_strings_approximate(x, y, threshold=0.7): +def compare_strings_approximate(x, y, threshold=0.7, autojunk=False): "Compare to strings with approximate heuristics." # TODO: Add configuration framework # TODO: Tune threshold with realistic sources @@ -63,7 +63,7 @@ def compare_strings_approximate(x, y, threshold=0.7): # So the heavy ratio function is only used for close calls. # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False) - s = difflib.SequenceMatcher(None, x, y, autojunk=False) + s = difflib.SequenceMatcher(None, x, y, autojunk=autojunk) # Use only the fast ratio approximations first if s.real_quick_ratio() < threshold: diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 1904f830..e9ab4820 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -69,7 +69,7 @@ def compare_text_approximate(x, y): if nx < shortlen and ny < shortlen: return True - return compare_strings_approximate(x, y, threshold=0.7) + return compare_strings_approximate(x, y, threshold=0.7, autojunk=True) def compare_text_strict(x, y): From 11ebe75fa358525b28217044898f4f6b476f7357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 12:47:24 +0000 Subject: [PATCH 2/4] Don't use autojunk. Instead drop the most expensive .ratio() call for approximate alignment. Also improve the text comparison a bit more by making cutoffs for small text/plain situations more specific. --- nbdime/diffing/generic.py | 11 +++-- nbdime/diffing/notebooks.py | 85 ++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 43 deletions(-) diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py index 975f51f3..e0d685b5 100644 --- a/nbdime/diffing/generic.py +++ b/nbdime/diffing/generic.py @@ -33,7 +33,7 @@ def default_differs(): return defaultdict(lambda: diff) -def compare_strings_approximate(x, y, threshold=0.7, autojunk=False): +def compare_strings_approximate(x, y, threshold=0.7, quick=False): "Compare to strings with approximate heuristics." # TODO: Add configuration framework # TODO: Tune threshold with realistic sources @@ -63,14 +63,19 @@ def compare_strings_approximate(x, y, threshold=0.7, autojunk=False): # So the heavy ratio function is only used for close calls. # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False) - s = difflib.SequenceMatcher(None, x, y, autojunk=autojunk) + s = difflib.SequenceMatcher(None, x, y, autojunk=False) # Use only the fast ratio approximations first if s.real_quick_ratio() < threshold: return False if s.quick_ratio() < threshold: return False - return s.ratio() > threshold + + # Skip slower and stricter check unless if quick is set + if quick: + return True + else: + return s.ratio() > threshold def diff(a, b, path="", predicates=None, differs=None): diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index e9ab4820..d6335e17 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -35,6 +35,7 @@ re_pointer = re.compile(r"0x[a-f0-9]{8,16}", re.IGNORECASE) +re_number = re.compile(r"^[+-]?[0-9]*[.]?[0-9]*[eE]?[+-]?[0-9]*$") # List of mimes we can diff recursively _split_mimes = ( @@ -45,73 +46,77 @@ ) -# TODO: Maybe cleaner to make the split between strict/approximate -# an argument instead of separate functions. +# TODO: Rename compare_* -> align_* to better reflect what this is used for? -def compare_text_approximate(x, y): - # Fast cutoff when one is empty - if bool(x) != bool(y): - return False +# TODO: Maybe cleaner to make the split between strict/approximate +# an argument instead of separate functions. - if isinstance(x, list): - x = "".join(x) - if isinstance(y, list): - y = "".join(y) - - # TODO: Review whether this is wanted. - # The motivation is to align tiny - # strings in outputs such as a single number. - # Allow aligning short strings without comparison - nx = len(x) - ny = len(y) - shortlen = 10 # TODO: Add this to configuration framework - if nx < shortlen and ny < shortlen: - return True - return compare_strings_approximate(x, y, threshold=0.7, autojunk=True) +def compare_text(x, y, strict): + if strict: + return compare_strings_approximate(x, y, threshold=0.95) + else: + return compare_strings_approximate(x, y, threshold=0.7, quick=False) +def compare_text_approximate(x, y): + return compare_text(x, y, strict=False) def compare_text_strict(x, y): - # TODO: Doesn't have to be 100% equal here? - if isinstance(x, list): - x = "".join(x) - if isinstance(y, list): - y = "".join(y) - if len(x) == len(y) and x == y: - return True - return compare_strings_approximate(x, y, threshold=0.95) + return compare_text(x, y, strict=True) def compare_base64_strict(x, y): if len(x) != len(y): return False - # TODO: Handle base64 data another way? return x == y -def _compare_mimedata(mimetype, x, y, comp_text, comp_base64): +compare_text_plain_strict = compare_text_strict + +def compare_text_plain_approximate(x, y): + assert isinstance(x, string_types) + assert isinstance(y, string_types) + + # Special cutoffs for short texts + # TODO: Make this configurable behaviour? Or drop it completely? + shortlen = 256 # Magic number larger than typical single lines + if len(x) == len(y) and len(x) < shortlen: + # Align if differing by pointer values only + xsplit = re_pointer.split(x) + ysplit = re_pointer.split(y) + if xsplit == ysplit: + return True + + # Align simple numbers + if re_number.match(x) and re_number.match(y): + return True + + # Fallback to regular approximate text comparison + return compare_text_approximate(x, y) + + +def _compare_mimedata(mimetype, x, y, comp_text, comp_text_plain, comp_base64): mimetype = mimetype.lower() - # TODO: Test this. Match repr-style oneliners with random pointer + # Special case cutoffs for simple text/plain strings if mimetype == "text/plain": - # Allow short texts to only differ by pointer values - if "\n" not in x and "\n" not in y: - xsplit = re_pointer.split(x) - ysplit = re_pointer.split(y) - if xsplit == ysplit: - return True + return comp_text_plain(x, y) + # Pure text comparison if mimetype.startswith("text/"): return comp_text(x, y) # TODO: Compare binary images? #if mimetype.startswith("image/"): + + # Text values but not text/ type if isinstance(x, string_types) and isinstance(y, string_types): # Most likely base64 encoded data if _base64.match(x): return comp_base64(x, y) else: + # If not fallback to pure text comparison return comp_text(x, y) # Fallback to exactly equal @@ -120,12 +125,12 @@ def _compare_mimedata(mimetype, x, y, comp_text, comp_base64): def compare_mimedata_approximate(mimetype, x, y): return _compare_mimedata(mimetype, x, y, - compare_text_approximate, compare_base64_strict) + compare_text_approximate, compare_text_plain_approximate, compare_base64_strict) def compare_mimedata_strict(mimetype, x, y): return _compare_mimedata(mimetype, x, y, - compare_text_strict, compare_base64_strict) + compare_text_strict, compare_text_plain_strict, compare_base64_strict) def compare_mimebundle_approximate(x, y): From 32122024c2de03bb5fbdd6b9a01dd565e580eb8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 18:05:54 +0000 Subject: [PATCH 3/4] Switch quick to True... --- nbdime/diffing/notebooks.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index d6335e17..f8e3f6fe 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -53,17 +53,11 @@ # an argument instead of separate functions. -def compare_text(x, y, strict): - if strict: - return compare_strings_approximate(x, y, threshold=0.95) - else: - return compare_strings_approximate(x, y, threshold=0.7, quick=False) - def compare_text_approximate(x, y): - return compare_text(x, y, strict=False) + return compare_strings_approximate(x, y, threshold=0.7, quick=True) def compare_text_strict(x, y): - return compare_text(x, y, strict=True) + return compare_strings_approximate(x, y, threshold=0.95) def compare_base64_strict(x, y): From 490819a7de898889da0aac98a96db7e1de1fd131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Fri, 20 Jan 2017 11:52:13 +0000 Subject: [PATCH 4/4] Drop quick parameter to string comparison. --- nbdime/diffing/generic.py | 9 ++------- nbdime/diffing/notebooks.py | 3 ++- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py index e0d685b5..1686cef3 100644 --- a/nbdime/diffing/generic.py +++ b/nbdime/diffing/generic.py @@ -33,7 +33,7 @@ def default_differs(): return defaultdict(lambda: diff) -def compare_strings_approximate(x, y, threshold=0.7, quick=False): +def compare_strings_approximate(x, y, threshold=0.7): "Compare to strings with approximate heuristics." # TODO: Add configuration framework # TODO: Tune threshold with realistic sources @@ -70,12 +70,7 @@ def compare_strings_approximate(x, y, threshold=0.7, quick=False): return False if s.quick_ratio() < threshold: return False - - # Skip slower and stricter check unless if quick is set - if quick: - return True - else: - return s.ratio() > threshold + return s.ratio() > threshold def diff(a, b, path="", predicates=None, differs=None): diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index f8e3f6fe..35ba1733 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -54,7 +54,8 @@ def compare_text_approximate(x, y): - return compare_strings_approximate(x, y, threshold=0.7, quick=True) + return compare_strings_approximate(x, y, threshold=0.7) + def compare_text_strict(x, y): return compare_strings_approximate(x, y, threshold=0.95)