From b03aad812aef7a28120d09911ea55de55336e195 Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 12 Jan 2017 17:45:23 +0100 Subject: [PATCH 01/10] Optimize diff: Don't deepcopy data --- nbdime/diffing/notebooks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index dd4f79f5..2ca16a54 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -357,10 +357,12 @@ def diff_single_outputs(a, b, path="/cells/*/outputs/*", if a.output_type in ("execute_result", "display_data"): di = MappingDiffBuilder() + tmp_data = a.pop('data') a_conj = copy.deepcopy(a) - del a_conj['data'] + a.data = tmp_data + tmp_data = b.pop('data') b_conj = copy.deepcopy(b) - del b_conj['data'] + b.data = tmp_data dd_conj = diff(a_conj, b_conj) if dd_conj: for e in dd_conj: From bd4e6e877a91df15aeac74592586d0e47ec98c99 Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 12 Jan 2017 17:47:01 +0100 Subject: [PATCH 02/10] Cleanup --- nbdime/diffing/snakes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nbdime/diffing/snakes.py b/nbdime/diffing/snakes.py index 133dd2e7..3c421532 100644 --- a/nbdime/diffing/snakes.py +++ b/nbdime/diffing/snakes.py @@ -9,7 +9,6 @@ Utilities for computing 'snakes', or contiguous sequences of equal elements of two sequences. """ -import operator from ..diff_format import SequenceDiffBuilder from .seq_bruteforce import bruteforce_compute_snakes From 89bb72f985eefccd06e7c3dab2fb67f6bc7b08e3 Mon Sep 17 00:00:00 2001 From: Vidar Tonaas Fauske Date: Thu, 12 Jan 2017 17:47:29 +0100 Subject: [PATCH 03/10] Some profiling tools --- nbdime/profiling.py | 107 ++++++++++++++++++++++++++++++++++++++++++++ setup.py | 1 + 2 files changed, 108 insertions(+) create mode 100644 nbdime/profiling.py diff --git a/nbdime/profiling.py b/nbdime/profiling.py new file mode 100644 index 00000000..6c78fc7b --- /dev/null +++ b/nbdime/profiling.py @@ -0,0 +1,107 @@ + +"""Tools for profiling diff/merge performance. + +Sometimes diffing times can become unacceptably long. +Then the tools in this module can help you profile and figure out +where the time is spent. For a more rigorous profiling, consider using +cPython, but for initial considerations these should be helpful. + +Typical profiling usage: +Add some statements like +from nbdime.profiling import timer +with timer.time('Key to identify this segment'): + + +Then, launch `python -m nbdime.profiling notebookA.ipynb notebookB.ipynb`. + +Example case: +Output diffing is slow. By cPython/timer, the time spent has been narrowed +to comparison of MIME types (`_compare_mimedata()`). To figure out which +MIME type is taking all the time, the following is added to that function: + + with timer.time(mimetype): + + +The output becomes: + + Key Calls Time + ---------- ------- ------------ + text/html 6 74.9686 + text/plain 12 0.000500202 + image/png 5 0 + +From this we can see that the HTML comparison is really blowing up! +Further inspection reveales that there are two html outputs in each +notebook that contain an embedded base64 image. These caused the text +diff to blow up. + +""" + +import time +import contextlib +from tabulate import tabulate + + +def _sort_time(value): + time = value[1]['time'] + return -time + + +class TimePaths(object): + def __init__(self, verbose=False, enabled=True): + self.verbose = verbose + self.map = {} + self.enabled = enabled + + @contextlib.contextmanager + def time(self, key): + start = time.time() + yield + end = time.time() + secs = end - start + if not self.enabled: + return + if key in self.map: + self.map[key]['time'] += secs + self.map[key]['calls'] += 1 + else: + self.map[key] = dict(time=secs, calls=1) + + @contextlib.contextmanager + def enable(self): + old = self.enabled + self.enabled = True + yield + self.enabled = old + + @contextlib.contextmanager + def disable(self): + old = self.enabled + self.enabled = False + yield + self.enabled = old + + def __str__(self): + # First, sort by path + items = sorted(self.map.items(), key=_sort_time) + lines = [] + for key, data in items: + time = data['time'] + calls = data['calls'] + lines.append((key, calls, time)) + return tabulate(lines, headers=['Key', 'Calls', 'Time']) + + +timer = TimePaths(enabled=False) + + +def profile_diff_paths(args=None): + import nbdime.nbdiffapp + import nbdime.profiling + nbdime.nbdiffapp.main(args) + data = str(nbdime.profiling.timer) + print(data) + + +if __name__ == "__main__": + profile_diff_paths() diff --git a/setup.py b/setup.py index 7f1c5e37..d4b7637e 100644 --- a/setup.py +++ b/setup.py @@ -245,6 +245,7 @@ def run(self): 'jsonschema', 'mock', 'requests', + 'tabulate', # For profiling ], 'docs': [ 'sphinx', From c76c2e41fe819429047bc4030eb3a06dd5475b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 09:12:24 +0000 Subject: [PATCH 04/10] Typo in comment. --- nbdime/profiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nbdime/profiling.py b/nbdime/profiling.py index 6c78fc7b..87340b32 100644 --- a/nbdime/profiling.py +++ b/nbdime/profiling.py @@ -31,7 +31,7 @@ image/png 5 0 From this we can see that the HTML comparison is really blowing up! -Further inspection reveales that there are two html outputs in each +Further inspection reveals that there are two html outputs in each notebook that contain an embedded base64 image. These caused the text diff to blow up. From d658d3493a3e5cd80491301290508ecb330936bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 09:13:49 +0000 Subject: [PATCH 05/10] Bugfix in pointer regex. --- nbdime/diffing/notebooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 2ca16a54..3a80d4cf 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -31,9 +31,9 @@ # A regexp matching common python repr-style output like # -re_repr = re.compile(r"<[a-zA-Z0-9._]+ at 0x[a-zA-Z0-9]+>") +re_repr = re.compile(r"<[a-zA-Z0-9._]+ at 0[xX][a-fA-F0-9]{8,16}>") -re_pointer = re.compile(r"0[xX][a-zA-Z0-9]{8,16}") +re_pointer = re.compile(r"0[xX][a-fA-F0-9]{8,16}") # List of mimes we can diff recursively From 44b5930e2d0d51564e6eaded4ee8462979096000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 12:47:24 +0000 Subject: [PATCH 06/10] Minor adjustments --- nbdime/diffing/generic.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py index a4bd5c2f..1686cef3 100644 --- a/nbdime/diffing/generic.py +++ b/nbdime/diffing/generic.py @@ -38,8 +38,13 @@ def compare_strings_approximate(x, y, threshold=0.7): # TODO: Add configuration framework # TODO: Tune threshold with realistic sources - # Cutoff on equality (Python has fast hash functions for strings) - if x == y: + # Fast cutoff when one is empty + if bool(x) != bool(y): + return False + + # Cutoff on equality: Python has fast hash functions for strings, + # and lists of strings also works fine + if len(x) == len(y) and x == y: return True # TODO: Investigate performance and quality of this difflib ratio approach, @@ -55,9 +60,12 @@ def compare_strings_approximate(x, y, threshold=0.7): # equal items, at least in the Myers diff algorithm. # Most other comparisons will likely not be very similar, # and the (real_)quick_ratio cutoffs will speed up those. + # So the heavy ratio function is only used for close calls. # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False) s = difflib.SequenceMatcher(None, x, y, autojunk=False) + + # Use only the fast ratio approximations first if s.real_quick_ratio() < threshold: return False if s.quick_ratio() < threshold: From 416c0cbf4434892a28dea2ccc3f1118cb3facbb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 15:55:29 +0000 Subject: [PATCH 07/10] A couple of bugfixes in merge algorithm. --- nbdime/diff_format.py | 1 + nbdime/merging/decisions.py | 6 +++--- nbdime/merging/generic.py | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/nbdime/diff_format.py b/nbdime/diff_format.py index 4c6a2ba3..759d1d19 100644 --- a/nbdime/diff_format.py +++ b/nbdime/diff_format.py @@ -94,6 +94,7 @@ def op_removerange(key, length): def op_patch(key, diff): "Create a diff entry to patch value at key with diff." + assert diff is not None return DiffEntry(op=DiffOp.PATCH, key=key, diff=diff) diff --git a/nbdime/merging/decisions.py b/nbdime/merging/decisions.py index 1050de69..0c25cf81 100644 --- a/nbdime/merging/decisions.py +++ b/nbdime/merging/decisions.py @@ -413,10 +413,10 @@ def push_patch_decision(decision, prefix): assert dec.common_path[-1] == key, "Key %s not at end of %s" % ( key, dec.common_path) dec.common_path = dec.common_path[:-1] # pop key - dec.local_diff = [op_patch(key, dec.local_diff)] - dec.remote_diff = [op_patch(key, dec.remote_diff)] + dec.local_diff = [op_patch(key, dec.local_diff)] if dec.local_diff else [] + dec.remote_diff = [op_patch(key, dec.remote_diff)] if dec.remote_diff else [] if dec.action == "custom": - dec.custom_diff = [op_patch(key, dec.custom_diff)] + dec.custom_diff = [op_patch(key, dec.custom_diff)] if dec.custom_diff else [] return dec diff --git a/nbdime/merging/generic.py b/nbdime/merging/generic.py index ca1cc817..5b3bf8fb 100644 --- a/nbdime/merging/generic.py +++ b/nbdime/merging/generic.py @@ -167,6 +167,8 @@ def __unused__wrap_subconflicts(key, subconflicts): for conf in subconflicts: (path, ld, rd, strategy) = conf assert path[-1] == key + assert ld is not None + assert rd is not None path = path[:-1] ld = [op_patch(key, ld)] rd = [op_patch(key, rd)] @@ -202,6 +204,7 @@ def adjust_patch_level(target_path, common_path, diff): newdiff = [] for d in diff: nd = d + assert nd is not None for key in remainder_path: nd = op_patch(key, nd) newdiff.append(nd) @@ -216,6 +219,8 @@ def collect_diffs(path, decisions): rd = adjust_patch_level(path, d.common_path, d.remote_diff) local_diff.extend(ld) remote_diff.extend(rd) + local_diff = combine_patches(local_diff) + remote_diff = combine_patches(remote_diff) return local_diff, remote_diff From 6b2e4d5607c4b7a18c328833197f4ee04d405b5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Sandve=20Aln=C3=A6s?= Date: Thu, 19 Jan 2017 16:08:09 +0000 Subject: [PATCH 08/10] Remove old fixme comments. --- nbdime/merging/generic.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/nbdime/merging/generic.py b/nbdime/merging/generic.py index 5b3bf8fb..c65584d5 100644 --- a/nbdime/merging/generic.py +++ b/nbdime/merging/generic.py @@ -25,16 +25,6 @@ from nbdime.merging.decisions import pop_all_patch_decisions, push_patch_decision -# FIXME XXX: Main todos left: -# - Rewrite resolve_* to work directly on decisions object, -# trying to resolve any conflicted decisions found -# - _merge_strings is probably broken, test and fix -# - extend test_merge_notebooks_inline with more tests -# - make existing test suite pass, either by modifying tests or fixing remaining bugs -# - test that this still works with mergetool (it shouldn't need any changes in principle) -# - remove autoresolve.py after checking for anything more we need from there - - # ============================================================================= # # Utility code follows From 028f0e13b2f7ae8015bd87073051dbb806410429 Mon Sep 17 00:00:00 2001 From: vidartf Date: Thu, 16 Feb 2017 12:39:03 +0100 Subject: [PATCH 09/10] Improve drag workaround --- nbdime-web/src/common/dragpanel.ts | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/nbdime-web/src/common/dragpanel.ts b/nbdime-web/src/common/dragpanel.ts index f4e879a7..89fc5658 100644 --- a/nbdime-web/src/common/dragpanel.ts +++ b/nbdime-web/src/common/dragpanel.ts @@ -31,25 +31,6 @@ import { } from 'phosphor/lib/dom/dragdrop'; -//TODO: Remove when fixed: -/** - * Workaround for phosphor bug #165. - */ -class WorkaroundDrag extends Drag { -} - -function _attachDragImage(clientX: number, clientY: number): void { - (Drag as any).prototype._attachDragImage.call(this, clientX + window.pageXOffset, clientY + window.pageYOffset); -} -function _moveDragImage(clientX: number, clientY: number): void { - (Drag as any).prototype._moveDragImage.call(this, clientX + window.pageXOffset, clientY + window.pageYOffset); -} - -// monkeypatch: -(WorkaroundDrag.prototype as any)._attachDragImage = _attachDragImage; -(WorkaroundDrag.prototype as any)._moveDragImage = _moveDragImage; - - /** * The class name added to the DropPanel */ @@ -464,7 +445,7 @@ abstract class DragDropPanelBase extends DropPanel { let dragImage = this.getDragImage(handle); // Set up the drag event. - this.drag = new WorkaroundDrag({ + this.drag = new Drag({ dragImage: dragImage || undefined, mimeData: new MimeData(), supportedActions: 'all', @@ -475,6 +456,12 @@ abstract class DragDropPanelBase extends DropPanel { // Start the drag and remove the mousemove listener. this.drag.start(clientX, clientY).then(this.onDragComplete.bind(this)); + //TODO: Remove when updating to phosphor 1.0: + // Workaround for phosphor bug #165: + if (dragImage) { + dragImage.style.position = 'fixed'; + } + // End workaround document.removeEventListener('mousemove', this, true); document.removeEventListener('mouseup', this, true); } From 4880e526c260fffd8cb98c004cc56d7981ce3ef7 Mon Sep 17 00:00:00 2001 From: vidartf Date: Thu, 16 Feb 2017 14:13:10 +0100 Subject: [PATCH 10/10] Use `re.IGNORECASE` for diffing regexps --- nbdime/diffing/notebooks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 3a80d4cf..829ba42a 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -27,13 +27,13 @@ __all__ = ["diff_notebooks"] # A regexp matching base64 encoded data -_base64 = re.compile(r'^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$', re.MULTILINE | re.UNICODE) +_base64 = re.compile(r'^(?:[a-z0-9+/]{4})*(?:[a-z0-9+/]{2}==|[a-z0-9+/]{3}=)?$', re.MULTILINE | re.UNICODE | re.IGNORECASE) # A regexp matching common python repr-style output like # -re_repr = re.compile(r"<[a-zA-Z0-9._]+ at 0[xX][a-fA-F0-9]{8,16}>") +re_repr = re.compile(r"<[a-z0-9._]+ at 0x[a-f0-9]{8,16}>", re.IGNORECASE) -re_pointer = re.compile(r"0[xX][a-fA-F0-9]{8,16}") +re_pointer = re.compile(r"0x[a-f0-9]{8,16}", re.IGNORECASE) # List of mimes we can diff recursively