From 623c4482c001657d3c2b3945712bc1d45525e7ce Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Sat, 10 May 2014 09:17:24 -0400
Subject: [PATCH 1/6] Faster groupby!

Issue #178 impressed upon me just how costly attribute resolution can
be. In this case, `groupby` was made faster by avoiding resolving the
attribute `list.append`.

This implementation is also more memory efficient than the current
version, which uses a `defaultdict` that gets cast to a `dict`. While
casting a defaultdict `d` to a dict via `dict(d)` is fast, it still
performs a *copy*.

Honorable mention goes to the following implementation:

```python
def groupby_alt(func, seq):
    d = collections.defaultdict(lambda: [].append)
    for item in seq:
        d[func(item)](item)
    rv = {}
    for k, v in iteritems(d):
        rv[k] = v.__self__
    return rv
```

This alternative implementation can at times be *very* impressive.
You should play with it!
---
 toolz/itertoolz.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
index c187193f..073aa440 100644
--- a/toolz/itertoolz.py
+++ b/toolz/itertoolz.py
@@ -3,7 +3,8 @@
 import collections
 import operator
 from functools import partial
-from toolz.compatibility import map, filter, filterfalse, zip, zip_longest
+from toolz.compatibility import (map, filter, filterfalse, zip, zip_longest,
+                                 iteritems)
 
 
 __all__ = ('remove', 'accumulate', 'groupby', 'merge_sorted', 'interleave',
@@ -66,12 +67,19 @@ def groupby(func, seq):
     {False: [1, 3, 5, 7], True: [2, 4, 6, 8]}
 
     See Also:
-        ``countby``
+        countby
     """
-    d = collections.defaultdict(list)
+    d = {}
     for item in seq:
-        d[func(item)].append(item)
-    return dict(d)
+        key = func(item)
+        if key not in d:
+            d[key] = [item].append
+        else:
+            d[key](item)
+    # This is okay to do, because we are not adding or removing keys
+    for k, v in iteritems(d):
+        d[k] = v.__self__
+    return d
 
 
 def merge_sorted(*seqs, **kwargs):

From 76054c8fb448f066699229b8e51ea74a8b3469ea Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Sat, 10 May 2014 12:59:32 -0400
Subject: [PATCH 2/6] Add useful code comments to groupby including pythonic
 implementation

---
 toolz/itertoolz.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
index 073aa440..3e11d831 100644
--- a/toolz/itertoolz.py
+++ b/toolz/itertoolz.py
@@ -69,16 +69,25 @@ def groupby(func, seq):
     See Also:
         countby
     """
+    # The commented code below shows what is probably the fastest
+    # "pythonic" implementation of `groupby`:
+    #
+    # d = collections.defaultdict(list)
+    # for item in seq:
+    #     d[func(item)].append(item)
+    # return dict(d)
+
     d = {}
     for item in seq:
         key = func(item)
         if key not in d:
-            d[key] = [item].append
+            d[key] = [item].append  # avoid method resolution overhead
         else:
-            d[key](item)
-    # This is okay to do, because we are not adding or removing keys
-    for k, v in iteritems(d):
-        d[k] = v.__self__
+            d[key](item)  # append item to list
+
+    for key, appendval in iteritems(d):
+        # `mylist.append.__self__` references original list instance `mylist`
+        d[key] = appendval.__self__
     return d

From 8365b4db655a879b41a650b39c1e2b0ffeeb8810 Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Sat, 10 May 2014 18:45:32 -0400
Subject: [PATCH 3/6] Small tweak to `groupby` to improve it for "real-world"
 data

Benchmarks have a tendency to use data that is pathologically perfect.
In such pathological cases, it was unclear whether the current or the
previous implementation would be preferred.
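To make "pathologically perfect" versus real-world input concrete, the
comparison can be sketched roughly as below. This is illustrative only,
not the exact benchmark used for this commit; it assumes `toolz.groupby`,
an identity key function, and timings that will vary by machine.

```python
# Rough benchmark sketch: ordered ("pathologically perfect") keys
# versus shuffled ("real-world") keys. Not the benchmark used for
# this commit; `toolz.groupby` and the identity key are assumptions.
import random
import timeit

from toolz import groupby

n, ngroups = 100000, 100
ordered = sorted(random.randrange(ngroups) for _ in range(n))
shuffled = list(ordered)
random.shuffle(shuffled)  # destroy the ordering of the keys

for name, seq in [('ordered', ordered), ('shuffled', shuffled)]:
    secs = timeit.timeit(lambda: groupby(lambda x: x, seq), number=10)
    print('%s: %.3f seconds' % (name, secs))
```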
However, when the input data gets shuffled, the implementation in this
commit is *clearly* superior. Therefore, I believe this is the best
implementation for *real* data.
---
 toolz/itertoolz.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
index 3e11d831..171719ea 100644
--- a/toolz/itertoolz.py
+++ b/toolz/itertoolz.py
@@ -80,10 +80,10 @@ def groupby(func, seq):
     d = {}
     for item in seq:
         key = func(item)
-        if key not in d:
-            d[key] = [item].append  # avoid method resolution overhead
-        else:
+        if key in d:
             d[key](item)  # append item to list
+        else:
+            d[key] = [item].append  # avoid method resolution overhead
 
     for key, appendval in iteritems(d):
         # `mylist.append.__self__` references original list instance `mylist`

From 6d330339467126185af31b5c6d319580ef5ee8c0 Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Sun, 11 May 2014 23:10:00 -0400
Subject: [PATCH 4/6] Add a new implementation of `groupby` that uses two
 dicts.

The previous commit was wrong: it is actually slower than the commit
before it. There was a mistake in the code used for the benchmarks.

I believe the new implementation has optimal performance as groups get
larger. It is also fast when creating a new group. It is slowest when
each group has two or three items in it, but it is still fast enough
not to impact the general performance of the algorithm.

A note on size: using a second dict in the implementation doesn't add
much memory. Let us consider the size used by all of the
containers--dicts and lists--but not their contents. In the worst-case
scenario, in which both dicts have two items (note that `fastdict` only
has groups of length two or greater), having a second dict increases
memory usage by only about 25%.
---
 toolz/itertoolz.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
index 171719ea..5d23c088 100644
--- a/toolz/itertoolz.py
+++ b/toolz/itertoolz.py
@@ -77,18 +77,22 @@ def groupby(func, seq):
     #     d[func(item)].append(item)
     # return dict(d)
 
-    d = {}
+    rv = {}
+    fastdict = {}
     for item in seq:
         key = func(item)
-        if key in d:
-            d[key](item)  # append item to list
+        if key in fastdict:
+            # Optimal asymptotic performance
+            fastdict[key](item)  # append item to list
+        elif key not in rv:
+            # Fast initialization of groups
+            rv[key] = [item]
         else:
-            d[key] = [item].append  # avoid method resolution overhead
-
-    for key, appendval in iteritems(d):
-        # `mylist.append.__self__` references original list instance `mylist`
-        d[key] = appendval.__self__
-    return d
+            # Add `list.append` to `fastdict` to avoid attribute resolution
+            # overhead. This improves asymptotic speed as groups get larger.
+            val = fastdict[key] = rv[key].append
+            val(item)
+    return rv

From d871c9431ce418ffd2cd9920d127f198b47de63a Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Mon, 12 May 2014 12:48:36 -0400
Subject: [PATCH 5/6] No need to import `compatibility.iteritems` in
 `itertoolz`

---
 toolz/itertoolz.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
index 5d23c088..6bcf8810 100644
--- a/toolz/itertoolz.py
+++ b/toolz/itertoolz.py
@@ -3,8 +3,7 @@
 import collections
 import operator
 from functools import partial
-from toolz.compatibility import (map, filter, filterfalse, zip, zip_longest,
-                                 iteritems)
+from toolz.compatibility import map, filter, filterfalse, zip, zip_longest
 
 
 __all__ = ('remove', 'accumulate', 'groupby', 'merge_sorted', 'interleave',

From 42dc568e4141c32c34c8c5a926f2b119928c3add Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Thu, 15 May 2014 12:53:53 -0400
Subject: [PATCH 6/6] Choose a version of `groupby` that is easier to
 understand.

This is `groupby_alt` from the original commit comment in this branch.
See discussion at #179.

This version performs very well as groups become larger. The previous
implementation performs well when 10% or more of the elements form
groups of length one.

We plan to have various implementations in the `benchmarkz` repository,
which will let contributors add benchmarks that they care about and
easily run them on all variants of `groupby`.
---
 toolz/itertoolz.py | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/toolz/itertoolz.py b/toolz/itertoolz.py
index 6bcf8810..f380213b 100644
--- a/toolz/itertoolz.py
+++ b/toolz/itertoolz.py
@@ -3,7 +3,8 @@
 import collections
 import operator
 from functools import partial
-from toolz.compatibility import map, filter, filterfalse, zip, zip_longest
+from toolz.compatibility import (map, filter, filterfalse, zip, zip_longest,
+                                 iteritems)
 
 
 __all__ = ('remove', 'accumulate', 'groupby', 'merge_sorted', 'interleave',
@@ -68,29 +69,12 @@ def groupby(func, seq):
     See Also:
         countby
     """
-    # The commented code below shows what is probably the fastest
-    # "pythonic" implementation of `groupby`:
-    #
-    # d = collections.defaultdict(list)
-    # for item in seq:
-    #     d[func(item)].append(item)
-    # return dict(d)
-
-    rv = {}
-    fastdict = {}
+    d = collections.defaultdict(lambda: [].append)
     for item in seq:
-        key = func(item)
-        if key in fastdict:
-            # Optimal asymptotic performance
-            fastdict[key](item)  # append item to list
-        elif key not in rv:
-            # Fast initialization of groups
-            rv[key] = [item]
-        else:
-            # Add `list.append` to `fastdict` to avoid attribute resolution
-            # overhead. This improves asymptotic speed as groups get larger.
-            val = fastdict[key] = rv[key].append
-            val(item)
+        d[func(item)](item)
+    rv = {}
+    for k, v in iteritems(d):
+        rv[k] = v.__self__
     return rv
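Taken out of the diff context, the final implementation chosen in this
series boils down to the self-contained sketch below. It uses Python 3's
`dict.items` in place of the `compatibility.iteritems` shim from the
patches; everything else mirrors the code in PATCH 6/6.

```python
import collections


def groupby(func, seq):
    # Map each key to a *bound* `list.append` method, so the `.append`
    # attribute is resolved once per group instead of once per item.
    d = collections.defaultdict(lambda: [].append)
    for item in seq:
        d[func(item)](item)
    # A bound method's `__self__` is the object it is bound to, i.e. the
    # underlying list, so this recovers the grouped lists themselves.
    rv = {}
    for k, v in d.items():
        rv[k] = v.__self__
    return rv


print(groupby(len, ['cat', 'dog', 'bird', 'ox']))
# {3: ['cat', 'dog'], 4: ['bird'], 2: ['ox']}
```

The `__self__` unwrapping at the end is what lets the hot loop skip the
per-item attribute resolution that motivated this branch in the first
place.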