From c66c50b1b516a1f0a7207a4ddf947a940469f30f Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 20 Apr 2017 13:27:42 +0200 Subject: [PATCH 1/4] Add caching for mimedata comparison It seems that with the hierarchical algorithm we end up comparing the same things several times. Due to their low level in the hierarchy, and comparatively large size, MIME data comparisons are especially affected by this. For this reason, we simply add a LRU cache on the comparison. --- nbdime/diffing/notebooks.py | 5 +++++ setup.py | 1 + 2 files changed, 6 insertions(+) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 1904f830..4e7705ce 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -18,6 +18,10 @@ from collections import defaultdict from six import string_types from six.moves import zip +try: + from functools import lru_cache +except ImportError: + from backports.functools_lru_cache import lru_cache from ..diff_format import MappingDiffBuilder, DiffOp @@ -90,6 +94,7 @@ def compare_base64_strict(x, y): return x == y +@lru_cache(maxsize=1024, typed=False) def _compare_mimedata(mimetype, x, y, comp_text, comp_base64): mimetype = mimetype.lower() diff --git a/setup.py b/setup.py index ca26f9ce..b11530db 100644 --- a/setup.py +++ b/setup.py @@ -258,6 +258,7 @@ def run(self): ':python_version == "2.7"': [ 'backports.shutil_which', + 'backports.functools_lru_cache', ], } From ce8e5b7b15af3535fc21e6782de7b6456f0ffe46 Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 20 Apr 2017 13:28:30 +0200 Subject: [PATCH 2/4] Cleanup/adjustments --- nbdime/diffing/notebooks.py | 11 +++++++---- nbdime/profiling.py | 12 +++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 4e7705ce..ae5d011a 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -3,8 +3,6 @@ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. -from __future__ import unicode_literals - """Tools for diffing notebooks. All diff tools here currently assumes the notebooks have already been @@ -12,6 +10,8 @@ Up- and down-conversion is handled by nbformat. """ +from __future__ import unicode_literals + import operator import re import copy @@ -362,17 +362,20 @@ def diff_single_outputs(a, b, path="/cells/*/outputs/*", if a.output_type in ("execute_result", "display_data"): di = MappingDiffBuilder() + # Separate data from output during diffing: tmp_data = a.pop('data') - a_conj = copy.deepcopy(a) - a.data = tmp_data + a_conj = copy.deepcopy(a) # Output without data + a.data = tmp_data # Restore output tmp_data = b.pop('data') b_conj = copy.deepcopy(b) b.data = tmp_data + # Only diff outputs without data: dd_conj = diff(a_conj, b_conj) if dd_conj: for e in dd_conj: di.append(e) + # Only diff data: dd = diff_mime_bundle(a.data, b.data, path=path+"/data") if dd: di.patch("data", dd) diff --git a/nbdime/profiling.py b/nbdime/profiling.py index 87340b32..64c2710f 100644 --- a/nbdime/profiling.py +++ b/nbdime/profiling.py @@ -55,12 +55,13 @@ def __init__(self, verbose=False, enabled=True): @contextlib.contextmanager def time(self, key): + if not self.enabled: + yield + return start = time.time() yield end = time.time() secs = end - start - if not self.enabled: - return if key in self.map: self.map[key]['time'] += secs self.map[key]['calls'] += 1 @@ -88,8 +89,8 @@ def __str__(self): for key, data in items: time = data['time'] calls = data['calls'] - lines.append((key, calls, time)) - return tabulate(lines, headers=['Key', 'Calls', 'Time']) + lines.append((key, calls, time, time / calls)) + return tabulate(lines, headers=['Key', 'Calls', 'Time', 'Time/Call']) timer = TimePaths(enabled=False) @@ -98,7 +99,8 @@ def __str__(self): def profile_diff_paths(args=None): import nbdime.nbdiffapp import nbdime.profiling - nbdime.nbdiffapp.main(args) + with nbdime.profiling.timer.enable(): + nbdime.nbdiffapp.main(args) data = str(nbdime.profiling.timer) print(data) From 4e8b96ac13f0a6a101544cf4c7c91e33b32fcb14 Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 20 Apr 2017 14:04:58 +0200 Subject: [PATCH 3/4] Fixes test error + further optimization We can only cache string type mime data comparisons (hashable). Also added a cache on compare_text_approximate, as profiling showed that a hash there gives a reasonable amount of hits. --- nbdime/diffing/notebooks.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index ae5d011a..adc78107 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -53,6 +53,7 @@ # an argument instead of separate functions. +@lru_cache(maxsize=1024, typed=False) def compare_text_approximate(x, y): # Fast cutoff when one is empty if bool(x) != bool(y): @@ -94,7 +95,15 @@ def compare_base64_strict(x, y): return x == y -@lru_cache(maxsize=1024, typed=False) +@lru_cache(maxsize=128, typed=False) +def _compare_mimedata_strings(x, y, comp_text, comp_base64): + # Most likely base64 encoded data + if _base64.match(x): + return comp_base64(x, y) + else: + return comp_text(x, y) + + def _compare_mimedata(mimetype, x, y, comp_text, comp_base64): mimetype = mimetype.lower() @@ -113,12 +122,7 @@ def _compare_mimedata(mimetype, x, y, comp_text, comp_base64): # TODO: Compare binary images? #if mimetype.startswith("image/"): if isinstance(x, string_types) and isinstance(y, string_types): - # Most likely base64 encoded data - if _base64.match(x): - return comp_base64(x, y) - else: - return comp_text(x, y) - + _compare_mimedata_strings(x, y, comp_text, comp_base64) # Fallback to exactly equal return x == y From a5f5423775862e3c0df4b3a78652a66681d66425 Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Fri, 21 Apr 2017 13:25:09 +0200 Subject: [PATCH 4/4] Also allow profiling timer to work as decorator --- nbdime/profiling.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nbdime/profiling.py b/nbdime/profiling.py index 64c2710f..e0aadbec 100644 --- a/nbdime/profiling.py +++ b/nbdime/profiling.py @@ -40,6 +40,7 @@ import time import contextlib from tabulate import tabulate +from functools import wraps def _sort_time(value): @@ -68,6 +69,18 @@ def time(self, key): else: self.map[key] = dict(time=secs, calls=1) + def profile(self, key=None): + def decorator(function): + nonlocal key + if key is None: + key = function.__name__ or 'unknown' + @wraps(function) + def inner(*args, **kwargs): + with self.time(key): + return function(*args, **kwargs) + return inner + return decorator + @contextlib.contextmanager def enable(self): old = self.enabled