From 82c4e7cba692794df4bd1003f633b60b9fac21fe Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske <vidartf@gmail.com>
Date: Thu, 12 Jan 2017 17:46:43 +0100
Subject: [PATCH 1/4] Optimize diff: Use autojunk in approx text compare

---
 nbdime/diffing/generic.py   | 4 ++--
 nbdime/diffing/notebooks.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py
index 1686cef3..975f51f3 100644
--- a/nbdime/diffing/generic.py
+++ b/nbdime/diffing/generic.py
@@ -33,7 +33,7 @@ def default_differs():
     return defaultdict(lambda: diff)
 
 
-def compare_strings_approximate(x, y, threshold=0.7):
+def compare_strings_approximate(x, y, threshold=0.7, autojunk=False):
     "Compare to strings with approximate heuristics."
     # TODO: Add configuration framework
     # TODO: Tune threshold with realistic sources
@@ -63,7 +63,7 @@ def compare_strings_approximate(x, y, threshold=0.7):
 
     # So the heavy ratio function is only used for close calls.
     # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False)
-    s = difflib.SequenceMatcher(None, x, y, autojunk=False)
+    s = difflib.SequenceMatcher(None, x, y, autojunk=autojunk)
 
     # Use only the fast ratio approximations first
     if s.real_quick_ratio() < threshold:
diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index 1904f830..e9ab4820 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -69,7 +69,7 @@ def compare_text_approximate(x, y):
     if nx < shortlen and ny < shortlen:
         return True
 
-    return compare_strings_approximate(x, y, threshold=0.7)
+    return compare_strings_approximate(x, y, threshold=0.7, autojunk=True)
 
 
 def compare_text_strict(x, y):

From 11ebe75fa358525b28217044898f4f6b476f7357 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= <martinal@simula.no>
Date: Thu, 19 Jan 2017 12:47:24 +0000
Subject: [PATCH 2/4] Don't use autojunk. Instead drop the most expensive
 .ratio() call for approximate alignment. Also improve the text comparison a
 bit more by making cutoffs for small text/plain situations more specific.

---
 nbdime/diffing/generic.py   | 11 +++--
 nbdime/diffing/notebooks.py | 85 ++++++++++++++++++++-----------------
 2 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py
index 975f51f3..e0d685b5 100644
--- a/nbdime/diffing/generic.py
+++ b/nbdime/diffing/generic.py
@@ -33,7 +33,7 @@ def default_differs():
     return defaultdict(lambda: diff)
 
 
-def compare_strings_approximate(x, y, threshold=0.7, autojunk=False):
+def compare_strings_approximate(x, y, threshold=0.7, quick=False):
     "Compare to strings with approximate heuristics."
     # TODO: Add configuration framework
     # TODO: Tune threshold with realistic sources
@@ -63,14 +63,19 @@ def compare_strings_approximate(x, y, threshold=0.7, autojunk=False):
 
     # So the heavy ratio function is only used for close calls.
     # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False)
-    s = difflib.SequenceMatcher(None, x, y, autojunk=autojunk)
+    s = difflib.SequenceMatcher(None, x, y, autojunk=False)
 
     # Use only the fast ratio approximations first
     if s.real_quick_ratio() < threshold:
         return False
     if s.quick_ratio() < threshold:
         return False
-    return s.ratio() > threshold
+
+    # Skip slower and stricter check unless if quick is set
+    if quick:
+        return True
+    else:
+        return s.ratio() > threshold
 
 
 def diff(a, b, path="", predicates=None, differs=None):
diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index e9ab4820..d6335e17 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -35,6 +35,7 @@
 
 re_pointer = re.compile(r"0x[a-f0-9]{8,16}", re.IGNORECASE)
 
+re_number = re.compile(r"^[+-]?[0-9]*[.]?[0-9]*[eE]?[+-]?[0-9]*$")
 
 # List of mimes we can diff recursively
 _split_mimes = (
@@ -45,73 +46,77 @@
     )
 
 
-# TODO: Maybe cleaner to make the split between strict/approximate
-#       an argument instead of separate functions.
+# TODO: Rename compare_* -> align_* to better reflect what this is used for?
 
 
-def compare_text_approximate(x, y):
-    # Fast cutoff when one is empty
-    if bool(x) != bool(y):
-        return False
+# TODO: Maybe cleaner to make the split between strict/approximate
+#       an argument instead of separate functions.
 
-    if isinstance(x, list):
-        x = "".join(x)
-    if isinstance(y, list):
-        y = "".join(y)
-
-    # TODO: Review whether this is wanted.
-    #       The motivation is to align tiny
-    #       strings in outputs such as a single number.
-    # Allow aligning short strings without comparison
-    nx = len(x)
-    ny = len(y)
-    shortlen = 10  # TODO: Add this to configuration framework
-    if nx < shortlen and ny < shortlen:
-        return True
 
-    return compare_strings_approximate(x, y, threshold=0.7, autojunk=True)
+def compare_text(x, y, strict):
+    if strict:
+        return compare_strings_approximate(x, y, threshold=0.95)
+    else:
+        return compare_strings_approximate(x, y, threshold=0.7, quick=False)
 
+def compare_text_approximate(x, y):
+    return compare_text(x, y, strict=False)
 
 def compare_text_strict(x, y):
-    # TODO: Doesn't have to be 100% equal here?
-    if isinstance(x, list):
-        x = "".join(x)
-    if isinstance(y, list):
-        y = "".join(y)
-    if len(x) == len(y) and x == y:
-        return True
-    return compare_strings_approximate(x, y, threshold=0.95)
+    return compare_text(x, y, strict=True)
 
 
 def compare_base64_strict(x, y):
     if len(x) != len(y):
         return False
-    # TODO: Handle base64 data another way?
     return x == y
 
 
-def _compare_mimedata(mimetype, x, y, comp_text, comp_base64):
+compare_text_plain_strict = compare_text_strict
+
+def compare_text_plain_approximate(x, y):
+    assert isinstance(x, string_types)
+    assert isinstance(y, string_types)
+
+    # Special cutoffs for short texts
+    # TODO: Make this configurable behaviour? Or drop it completely?
+    shortlen = 256  # Magic number larger than typical single lines
+    if len(x) == len(y) and len(x) < shortlen:
+        # Align if differing by pointer values only
+        xsplit = re_pointer.split(x)
+        ysplit = re_pointer.split(y)
+        if xsplit == ysplit:
+            return True
+
+        # Align simple numbers
+        if re_number.match(x) and re_number.match(y):
+            return True
+
+    # Fallback to regular approximate text comparison
+    return compare_text_approximate(x, y)
+
+
+def _compare_mimedata(mimetype, x, y, comp_text, comp_text_plain, comp_base64):
     mimetype = mimetype.lower()
 
-    # TODO: Test this. Match repr-style oneliners with random pointer
+    # Special case cutoffs for simple text/plain strings
     if mimetype == "text/plain":
-        # Allow short texts to only differ by pointer values
-        if "\n" not in x and "\n" not in y:
-            xsplit = re_pointer.split(x)
-            ysplit = re_pointer.split(y)
-            if xsplit == ysplit:
-                return True
+        return comp_text_plain(x, y)
 
+    # Pure text comparison
     if mimetype.startswith("text/"):
         return comp_text(x, y)
 
     # TODO: Compare binary images?
     #if mimetype.startswith("image/"):
+
+    # Text values but not text/ type
     if isinstance(x, string_types) and isinstance(y, string_types):
         # Most likely base64 encoded data
         if _base64.match(x):
             return comp_base64(x, y)
         else:
+            # If not fallback to pure text comparison
             return comp_text(x, y)
 
     # Fallback to exactly equal
@@ -120,12 +125,12 @@ def _compare_mimedata(mimetype, x, y, comp_text, comp_base64):
 
 def compare_mimedata_approximate(mimetype, x, y):
     return _compare_mimedata(mimetype, x, y,
-        compare_text_approximate, compare_base64_strict)
+        compare_text_approximate, compare_text_plain_approximate, compare_base64_strict)
 
 
 def compare_mimedata_strict(mimetype, x, y):
     return _compare_mimedata(mimetype, x, y,
-        compare_text_strict, compare_base64_strict)
+        compare_text_strict, compare_text_plain_strict, compare_base64_strict)
 
 
 def compare_mimebundle_approximate(x, y):

From 32122024c2de03bb5fbdd6b9a01dd565e580eb8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= <martinal@simula.no>
Date: Thu, 19 Jan 2017 18:05:54 +0000
Subject: [PATCH 3/4] Switch quick to True...

---
 nbdime/diffing/notebooks.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index d6335e17..f8e3f6fe 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -53,17 +53,11 @@
 #       an argument instead of separate functions.
 
 
-def compare_text(x, y, strict):
-    if strict:
-        return compare_strings_approximate(x, y, threshold=0.95)
-    else:
-        return compare_strings_approximate(x, y, threshold=0.7, quick=False)
-
 def compare_text_approximate(x, y):
-    return compare_text(x, y, strict=False)
+    return compare_strings_approximate(x, y, threshold=0.7, quick=True)
 
 def compare_text_strict(x, y):
-    return compare_text(x, y, strict=True)
+    return compare_strings_approximate(x, y, threshold=0.95)
 
 
 def compare_base64_strict(x, y):

From 490819a7de898889da0aac98a96db7e1de1fd131 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= <martinal@simula.no>
Date: Fri, 20 Jan 2017 11:52:13 +0000
Subject: [PATCH 4/4] Drop quick parameter to string comparison.

---
 nbdime/diffing/generic.py   | 9 ++-------
 nbdime/diffing/notebooks.py | 3 ++-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py
index e0d685b5..1686cef3 100644
--- a/nbdime/diffing/generic.py
+++ b/nbdime/diffing/generic.py
@@ -33,7 +33,7 @@ def default_differs():
     return defaultdict(lambda: diff)
 
 
-def compare_strings_approximate(x, y, threshold=0.7, quick=False):
+def compare_strings_approximate(x, y, threshold=0.7):
     "Compare to strings with approximate heuristics."
     # TODO: Add configuration framework
     # TODO: Tune threshold with realistic sources
@@ -70,12 +70,7 @@ def compare_strings_approximate(x, y, threshold=0.7, quick=False):
         return False
     if s.quick_ratio() < threshold:
         return False
-
-    # Skip slower and stricter check unless if quick is set
-    if quick:
-        return True
-    else:
-        return s.ratio() > threshold
+    return s.ratio() > threshold
 
 
 def diff(a, b, path="", predicates=None, differs=None):
diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index f8e3f6fe..35ba1733 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -54,7 +54,8 @@
 
 
 def compare_text_approximate(x, y):
-    return compare_strings_approximate(x, y, threshold=0.7, quick=True)
+    return compare_strings_approximate(x, y, threshold=0.7)
+
 
 def compare_text_strict(x, y):
     return compare_strings_approximate(x, y, threshold=0.95)