From c66c50b1b516a1f0a7207a4ddf947a940469f30f Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske <vidartf@gmail.com>
Date: Thu, 20 Apr 2017 13:27:42 +0200
Subject: [PATCH 1/4] Add caching for mimedata comparison

It seems that with the hierarchical algorithm we end up comparing the
same things several times. Due to their low level in the hierarchy, and
comparatively large size, MIME data comparisons are especially affected
by this. For this reason, we simply add a LRU cache on the comparison.
---
 nbdime/diffing/notebooks.py | 5 +++++
 setup.py                    | 1 +
 2 files changed, 6 insertions(+)

diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index 1904f830..4e7705ce 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -18,6 +18,10 @@
 from collections import defaultdict
 from six import string_types
 from six.moves import zip
+try:
+    from functools import lru_cache
+except ImportError:
+    from backports.functools_lru_cache import lru_cache
 
 from ..diff_format import MappingDiffBuilder, DiffOp
 
@@ -90,6 +94,7 @@ def compare_base64_strict(x, y):
     return x == y
 
 
+@lru_cache(maxsize=1024, typed=False)
 def _compare_mimedata(mimetype, x, y, comp_text, comp_base64):
     mimetype = mimetype.lower()
 
diff --git a/setup.py b/setup.py
index ca26f9ce..b11530db 100644
--- a/setup.py
+++ b/setup.py
@@ -258,6 +258,7 @@ def run(self):
 
     ':python_version == "2.7"': [
         'backports.shutil_which',
+        'backports.functools_lru_cache',
     ],
 }
 

From ce8e5b7b15af3535fc21e6782de7b6456f0ffe46 Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske <vidartf@gmail.com>
Date: Thu, 20 Apr 2017 13:28:30 +0200
Subject: [PATCH 2/4] Cleanup/adjustments

---
 nbdime/diffing/notebooks.py | 11 +++++++----
 nbdime/profiling.py         | 12 +++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index 4e7705ce..ae5d011a 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -3,8 +3,6 @@
 # Copyright (c) Jupyter Development Team.
 # Distributed under the terms of the Modified BSD License.
 
-from __future__ import unicode_literals
-
 """Tools for diffing notebooks.
 
 All diff tools here currently assumes the notebooks have already been
@@ -12,6 +10,8 @@
 Up- and down-conversion is handled by nbformat.
 """
 
+from __future__ import unicode_literals
+
 import operator
 import re
 import copy
@@ -362,17 +362,20 @@ def diff_single_outputs(a, b, path="/cells/*/outputs/*",
     if a.output_type in ("execute_result", "display_data"):
         di = MappingDiffBuilder()
 
+        # Separate data from output during diffing:
         tmp_data = a.pop('data')
-        a_conj = copy.deepcopy(a)
-        a.data = tmp_data
+        a_conj = copy.deepcopy(a)  # Output without data
+        a.data = tmp_data          # Restore output
         tmp_data = b.pop('data')
         b_conj = copy.deepcopy(b)
         b.data = tmp_data
+        # Only diff outputs without data:
         dd_conj = diff(a_conj, b_conj)
         if dd_conj:
             for e in dd_conj:
                 di.append(e)
 
+        # Only diff data:
         dd = diff_mime_bundle(a.data, b.data, path=path+"/data")
         if dd:
             di.patch("data", dd)
diff --git a/nbdime/profiling.py b/nbdime/profiling.py
index 87340b32..64c2710f 100644
--- a/nbdime/profiling.py
+++ b/nbdime/profiling.py
@@ -55,12 +55,13 @@ def __init__(self, verbose=False, enabled=True):
 
     @contextlib.contextmanager
     def time(self, key):
+        if not self.enabled:
+            yield
+            return
         start = time.time()
         yield
         end = time.time()
         secs = end - start
-        if not self.enabled:
-            return
         if key in self.map:
             self.map[key]['time'] += secs
             self.map[key]['calls'] += 1
@@ -88,8 +89,8 @@ def __str__(self):
         for key, data in items:
             time = data['time']
             calls = data['calls']
-            lines.append((key, calls, time))
-        return tabulate(lines, headers=['Key', 'Calls', 'Time'])
+            lines.append((key, calls, time, time / calls))
+        return tabulate(lines, headers=['Key', 'Calls', 'Time', 'Time/Call'])
 
 
 timer = TimePaths(enabled=False)
@@ -98,7 +99,8 @@ def __str__(self):
 def profile_diff_paths(args=None):
     import nbdime.nbdiffapp
     import nbdime.profiling
-    nbdime.nbdiffapp.main(args)
+    with nbdime.profiling.timer.enable():
+        nbdime.nbdiffapp.main(args)
     data = str(nbdime.profiling.timer)
     print(data)
 

From 4e8b96ac13f0a6a101544cf4c7c91e33b32fcb14 Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske <vidartf@gmail.com>
Date: Thu, 20 Apr 2017 14:04:58 +0200
Subject: [PATCH 3/4] Fixes test error + further optimization

We can only cache string type mime data comparisons (hashable). Also
added a cache on compare_text_approximate, as profiling showed that a
hash there gives a reasonable amount of hits.
---
 nbdime/diffing/notebooks.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
index ae5d011a..adc78107 100644
--- a/nbdime/diffing/notebooks.py
+++ b/nbdime/diffing/notebooks.py
@@ -53,6 +53,7 @@
 #       an argument instead of separate functions.
 
 
+@lru_cache(maxsize=1024, typed=False)
 def compare_text_approximate(x, y):
     # Fast cutoff when one is empty
     if bool(x) != bool(y):
@@ -94,7 +95,15 @@ def compare_base64_strict(x, y):
     return x == y
 
 
-@lru_cache(maxsize=1024, typed=False)
+@lru_cache(maxsize=128, typed=False)
+def _compare_mimedata_strings(x, y, comp_text, comp_base64):
+    # Most likely base64 encoded data
+    if _base64.match(x):
+        return comp_base64(x, y)
+    else:
+        return comp_text(x, y)
+
+
 def _compare_mimedata(mimetype, x, y, comp_text, comp_base64):
     mimetype = mimetype.lower()
 
@@ -113,12 +122,7 @@ def _compare_mimedata(mimetype, x, y, comp_text, comp_base64):
     # TODO: Compare binary images?
     #if mimetype.startswith("image/"):
     if isinstance(x, string_types) and isinstance(y, string_types):
-        # Most likely base64 encoded data
-        if _base64.match(x):
-            return comp_base64(x, y)
-        else:
-            return comp_text(x, y)
-
+        _compare_mimedata_strings(x, y, comp_text, comp_base64)
     # Fallback to exactly equal
     return x == y
 

From a5f5423775862e3c0df4b3a78652a66681d66425 Mon Sep 17 00:00:00 2001
From: Vidar Tonaas Fauske <vidartf@gmail.com>
Date: Fri, 21 Apr 2017 13:25:09 +0200
Subject: [PATCH 4/4] Also allow profiling timer to work as decorator

---
 nbdime/profiling.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/nbdime/profiling.py b/nbdime/profiling.py
index 64c2710f..e0aadbec 100644
--- a/nbdime/profiling.py
+++ b/nbdime/profiling.py
@@ -40,6 +40,7 @@
 import time
 import contextlib
 from tabulate import tabulate
+from functools import wraps
 
 
 def _sort_time(value):
@@ -68,6 +69,18 @@ def time(self, key):
         else:
             self.map[key] = dict(time=secs, calls=1)
 
+    def profile(self, key=None):
+        def decorator(function):
+            nonlocal key
+            if key is None:
+                key = function.__name__ or 'unknown'
+            @wraps(function)
+            def inner(*args, **kwargs):
+                with self.time(key):
+                    return function(*args, **kwargs)
+            return inner
+        return decorator
+
     @contextlib.contextmanager
     def enable(self):
         old = self.enabled