Faster groupby!

Issue pytoolz#178 impressed upon me just how costly attribute resolution can be. In this case, `groupby` was made faster by avoiding resolving the attribute `list.append` for every item. This implementation is also more memory efficient than the current version, which uses a `defaultdict` that gets cast to a `dict`. While casting a defaultdict `d` to a dict as `dict(d)` is fast, it is still a fast *copy*.

Honorable mention goes to the following implementation:

```python
def groupby_alt(func, seq):
    d = collections.defaultdict(lambda: [].append)
    for item in seq:
        d[func(item)](item)
    rv = {}
    for k, v in iteritems(d):
        rv[k] = v.__self__
    return rv
```

This alternative implementation can at times be *very* impressive. You should play with it!
eriknw · May 10, 2014 · 623c448 · 623c448
1 parent e810547
commit 623c448
Showing 1 changed file with 13 additions and 5 deletions.
diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
@@ -3,7 +3,8 @@
 import collections
 import operator
 from functools import partial
-from toolz.compatibility import map, filter, filterfalse, zip, zip_longest
+from toolz.compatibility import (map, filter, filterfalse, zip, zip_longest,
+                                 iteritems)
 
 
 __all__ = ('remove', 'accumulate', 'groupby', 'merge_sorted', 'interleave',
@@ -66,12 +67,19 @@ def groupby(func, seq):
     {False: [1, 3, 5, 7], True: [2, 4, 6, 8]}
 
     See Also:
-        ``countby``
+        countby
     """
-    d = collections.defaultdict(list)
+    d = {}
     for item in seq:
-        d[func(item)].append(item)
-    return dict(d)
+        key = func(item)
+        if key not in d:
+            d[key] = [item].append
+        else:
+            d[key](item)
+    # This is okay to do, because we are not adding or removing keys
+    for k, v in iteritems(d):
+        d[k] = v.__self__
+    return d
 
 
 def merge_sorted(*seqs, **kwargs):