Skip to content

Commit

Permalink
Merge pull request #257 from vidartf/fixes
Browse files Browse the repository at this point in the history
Various fixes and minor adjustments
  • Loading branch information
vidartf authored Feb 16, 2017
2 parents 9e6f400 + 4880e52 commit 352e8a5
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 41 deletions.
27 changes: 7 additions & 20 deletions nbdime-web/src/common/dragpanel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,6 @@ import {
} from 'phosphor/lib/dom/dragdrop';


//TODO: Remove when fixed:
/**
* Workaround for phosphor bug #165.
*/
class WorkaroundDrag extends Drag {
}

function _attachDragImage(clientX: number, clientY: number): void {
(Drag as any).prototype._attachDragImage.call(this, clientX + window.pageXOffset, clientY + window.pageYOffset);
}
function _moveDragImage(clientX: number, clientY: number): void {
(Drag as any).prototype._moveDragImage.call(this, clientX + window.pageXOffset, clientY + window.pageYOffset);
}

// monkeypatch:
(WorkaroundDrag.prototype as any)._attachDragImage = _attachDragImage;
(WorkaroundDrag.prototype as any)._moveDragImage = _moveDragImage;


/**
* The class name added to the DropPanel
*/
Expand Down Expand Up @@ -464,7 +445,7 @@ abstract class DragDropPanelBase extends DropPanel {
let dragImage = this.getDragImage(handle);

// Set up the drag event.
this.drag = new WorkaroundDrag({
this.drag = new Drag({
dragImage: dragImage || undefined,
mimeData: new MimeData(),
supportedActions: 'all',
Expand All @@ -475,6 +456,12 @@ abstract class DragDropPanelBase extends DropPanel {

// Start the drag and remove the mousemove listener.
this.drag.start(clientX, clientY).then(this.onDragComplete.bind(this));
//TODO: Remove when updating to phosphor 1.0:
// Workaround for phosphor bug #165:
if (dragImage) {
dragImage.style.position = 'fixed';
}
// End workaround
document.removeEventListener('mousemove', this, true);
document.removeEventListener('mouseup', this, true);
}
Expand Down
1 change: 1 addition & 0 deletions nbdime/diff_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def op_removerange(key, length):

def op_patch(key, diff):
    """Create a diff entry to patch value at key with diff."""
    # A patch op without a sub-diff is meaningless, so reject None early
    # rather than producing a malformed DiffEntry.
    assert diff is not None
    return DiffEntry(op=DiffOp.PATCH, key=key, diff=diff)


Expand Down
12 changes: 10 additions & 2 deletions nbdime/diffing/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,13 @@ def compare_strings_approximate(x, y, threshold=0.7):
# TODO: Add configuration framework
# TODO: Tune threshold with realistic sources

# Cutoff on equality (Python has fast hash functions for strings)
if x == y:
# Fast cutoff when one is empty
if bool(x) != bool(y):
return False

# Cutoff on equality: Python has fast hash functions for strings,
# and lists of strings also work fine
if len(x) == len(y) and x == y:
return True

# TODO: Investigate performance and quality of this difflib ratio approach,
Expand All @@ -55,9 +60,12 @@ def compare_strings_approximate(x, y, threshold=0.7):
# equal items, at least in the Myers diff algorithm.
# Most other comparisons will likely not be very similar,
# and the (real_)quick_ratio cutoffs will speed up those.

# So the heavy ratio function is only used for close calls.
# s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False)
s = difflib.SequenceMatcher(None, x, y, autojunk=False)

# Use only the fast ratio approximations first
if s.real_quick_ratio() < threshold:
return False
if s.quick_ratio() < threshold:
Expand Down
12 changes: 7 additions & 5 deletions nbdime/diffing/notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@
__all__ = ["diff_notebooks"]

# A regexp matching base64 encoded data
_base64 = re.compile(r'^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$', re.MULTILINE | re.UNICODE)
_base64 = re.compile(r'^(?:[a-z0-9+/]{4})*(?:[a-z0-9+/]{2}==|[a-z0-9+/]{3}=)?$', re.MULTILINE | re.UNICODE | re.IGNORECASE)

# A regexp matching common python repr-style output like
# <module.type at 0xmemoryaddress>
re_repr = re.compile(r"<[a-zA-Z0-9._]+ at 0x[a-zA-Z0-9]+>")
re_repr = re.compile(r"<[a-z0-9._]+ at 0x[a-f0-9]{8,16}>", re.IGNORECASE)

re_pointer = re.compile(r"0[xX][a-zA-Z0-9]{8,16}")
re_pointer = re.compile(r"0x[a-f0-9]{8,16}", re.IGNORECASE)


# List of mimes we can diff recursively
Expand Down Expand Up @@ -357,10 +357,12 @@ def diff_single_outputs(a, b, path="/cells/*/outputs/*",
if a.output_type in ("execute_result", "display_data"):
di = MappingDiffBuilder()

tmp_data = a.pop('data')
a_conj = copy.deepcopy(a)
del a_conj['data']
a.data = tmp_data
tmp_data = b.pop('data')
b_conj = copy.deepcopy(b)
del b_conj['data']
b.data = tmp_data
dd_conj = diff(a_conj, b_conj)
if dd_conj:
for e in dd_conj:
Expand Down
1 change: 0 additions & 1 deletion nbdime/diffing/snakes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
Utilities for computing 'snakes', or contiguous sequences of equal elements of two sequences.
"""

import operator
from ..diff_format import SequenceDiffBuilder
from .seq_bruteforce import bruteforce_compute_snakes

Expand Down
6 changes: 3 additions & 3 deletions nbdime/merging/decisions.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,10 +413,10 @@ def push_patch_decision(decision, prefix):
assert dec.common_path[-1] == key, "Key %s not at end of %s" % (
key, dec.common_path)
dec.common_path = dec.common_path[:-1] # pop key
dec.local_diff = [op_patch(key, dec.local_diff)]
dec.remote_diff = [op_patch(key, dec.remote_diff)]
dec.local_diff = [op_patch(key, dec.local_diff)] if dec.local_diff else []
dec.remote_diff = [op_patch(key, dec.remote_diff)] if dec.remote_diff else []
if dec.action == "custom":
dec.custom_diff = [op_patch(key, dec.custom_diff)]
dec.custom_diff = [op_patch(key, dec.custom_diff)] if dec.custom_diff else []
return dec


Expand Down
15 changes: 5 additions & 10 deletions nbdime/merging/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,6 @@
from nbdime.merging.decisions import pop_all_patch_decisions, push_patch_decision


# FIXME XXX: Main todos left:
# - Rewrite resolve_* to work directly on decisions object,
# trying to resolve any conflicted decisions found
# - _merge_strings is probably broken, test and fix
# - extend test_merge_notebooks_inline with more tests
# - make existing test suite pass, either by modifying tests or fixing remaining bugs
# - test that this still works with mergetool (it shouldn't need any changes in principle)
# - remove autoresolve.py after checking for anything more we need from there


# =============================================================================
#
# Utility code follows
Expand Down Expand Up @@ -167,6 +157,8 @@ def __unused__wrap_subconflicts(key, subconflicts):
for conf in subconflicts:
(path, ld, rd, strategy) = conf
assert path[-1] == key
assert ld is not None
assert rd is not None
path = path[:-1]
ld = [op_patch(key, ld)]
rd = [op_patch(key, rd)]
Expand Down Expand Up @@ -202,6 +194,7 @@ def adjust_patch_level(target_path, common_path, diff):
newdiff = []
for d in diff:
nd = d
assert nd is not None
for key in remainder_path:
nd = op_patch(key, nd)
newdiff.append(nd)
Expand All @@ -216,6 +209,8 @@ def collect_diffs(path, decisions):
rd = adjust_patch_level(path, d.common_path, d.remote_diff)
local_diff.extend(ld)
remote_diff.extend(rd)
local_diff = combine_patches(local_diff)
remote_diff = combine_patches(remote_diff)
return local_diff, remote_diff


Expand Down
107 changes: 107 additions & 0 deletions nbdime/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@

"""Tools for profiling diff/merge performance.
Sometimes diffing times can become unacceptably long.
Then the tools in this module can help you profile and figure out
where the time is spent. For a more rigorous profiling, consider using
cProfile, but for initial considerations these should be helpful.
Typical profiling usage:
Add some statements like
from nbdime.profiling import timer
with timer.time('Key to identify this segment'):
<code to time>
Then, launch `python -m nbdime.profiling notebookA.ipynb notebookB.ipynb`.
Example case:
Output diffing is slow. By cProfile/timer, the time spent has been narrowed
to comparison of MIME types (`_compare_mimedata()`). To figure out which
MIME type is taking all the time, the following is added to that function:
with timer.time(mimetype):
<existing function body>
The output becomes:
Key Calls Time
---------- ------- ------------
text/html 6 74.9686
text/plain 12 0.000500202
image/png 5 0
From this we can see that the HTML comparison is really blowing up!
Further inspection reveals that there are two html outputs in each
notebook that contain an embedded base64 image. These caused the text
diff to blow up.
"""

import time
import contextlib
from tabulate import tabulate


def _sort_time(value):
time = value[1]['time']
return -time


class TimePaths(object):
    """Accumulate wall-clock timings per user-supplied key.

    Wrap code in the `time(key)` context manager; repeated entries under
    the same key have their durations summed and their calls counted.
    Timing can be toggled globally (`enabled`) or temporarily via the
    `enable()`/`disable()` context managers.
    """

    def __init__(self, verbose=False, enabled=True):
        self.verbose = verbose
        # Maps key -> dict(time=<total seconds>, calls=<number of entries>)
        self.map = {}
        self.enabled = enabled

    @contextlib.contextmanager
    def time(self, key):
        """Time the enclosed block and accumulate the result under `key`."""
        start = time.time()
        try:
            yield
        finally:
            # Record in a finally so a raising block still contributes its
            # partial duration (the original silently dropped it).
            secs = time.time() - start
            if self.enabled:
                entry = self.map.setdefault(key, dict(time=0.0, calls=0))
                entry['time'] += secs
                entry['calls'] += 1

    @contextlib.contextmanager
    def enable(self):
        """Temporarily enable timing for the duration of a with-block."""
        old = self.enabled
        self.enabled = True
        try:
            yield
        finally:
            # Restore previous state even if the block raises; the original
            # leaked `enabled=True` on exceptions.
            self.enabled = old

    @contextlib.contextmanager
    def disable(self):
        """Temporarily disable timing for the duration of a with-block."""
        old = self.enabled
        self.enabled = False
        try:
            yield
        finally:
            self.enabled = old

    def __str__(self):
        # Sort entries by accumulated time, largest first
        # (the original comment claimed "sort by path", which was wrong).
        items = sorted(self.map.items(), key=_sort_time)
        lines = []
        for key, data in items:
            lines.append((key, data['calls'], data['time']))
        return tabulate(lines, headers=['Key', 'Calls', 'Time'])


timer = TimePaths(enabled=False)


def profile_diff_paths(args=None):
    """Run the nbdiff CLI entry point, then print the accumulated timings.

    `args` is forwarded to `nbdime.nbdiffapp.main`; None means use sys.argv.
    """
    # Imported locally so merely importing this module stays side-effect free.
    import nbdime.nbdiffapp
    import nbdime.profiling

    nbdime.nbdiffapp.main(args)
    print(str(nbdime.profiling.timer))


if __name__ == "__main__":
    # Entry point for `python -m nbdime.profiling notebookA.ipynb notebookB.ipynb`.
    profile_diff_paths()
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ def run(self):
'jsonschema',
'mock',
'requests',
'tabulate', # For profiling
],
'docs': [
'sphinx',
Expand Down

0 comments on commit 352e8a5

Please sign in to comment.