fixes #426, adding docs for

group_by
seperman · Oct 17, 2023 · 84fcc41 · 84fcc41
1 parent ade098a
commit 84fcc41
Showing 9 changed files with 216 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -23,9 +23,15 @@ Tested on Python 3.7+ and PyPy3.
 
 Please check the [ChangeLog](CHANGELOG.md) file for the detailed information.
 
+DeepDiff 6-6-1
+- Fix for [DeepDiff raises decimal exception when using significant digits](https://github.com/seperman/deepdiff/issues/426)
+- Introducing group_by_sort_key
+- Adding group_by 2D. For example `group_by=['last_name', 'zip_code']`
+
+
 DeepDiff 6-6-0
 
-- [Serialize To Flat Dicts]()
+- [Serialize To Flat Dicts](https://zepworks.com/deepdiff/current/serialization.html#delta-to-flat-dicts-label)
 - [NumPy 2.0 compatibility](https://github.com/seperman/deepdiff/pull/422) by [William Jamieson](https://github.com/WilliamJamieson)
 
 DeepDiff 6-5-0

diff --git a/deepdiff/diff.py b/deepdiff/diff.py
@@ -1601,35 +1601,64 @@ def _get_view_results(self, view):
             raise ValueError(INVALID_VIEW_MSG.format(view))
         return result
 
+    @staticmethod
+    def _get_key_for_group_by(row, group_by, item_name):  # Pop and return row[group_by]; a missing key is logged and the KeyError is re-raised.
+        try:
+            return row.pop(group_by)  # pop() also removes the grouping key from row, so the grouped row no longer carries it
+        except KeyError:
+            logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row))
+            raise  # bare raise: the caller still receives the original KeyError
+
     def _group_iterable_to_dict(self, item, group_by, item_name):
         """
         Convert a list of dictionaries into a dictionary of dictionaries
         where the key is the value of the group_by key in each dictionary.
         """
+        group_by_level2 = None
+        if isinstance(group_by, (list, tuple)):
+            group_by_level1 = group_by[0]
+            if len(group_by) > 1:
+                group_by_level2 = group_by[1]
+        else:
+            group_by_level1 = group_by
         if isinstance(item, Iterable) and not isinstance(item, Mapping):
             result = {}
             item_copy = deepcopy(item)
             for row in item_copy:
                 if isinstance(row, Mapping):
-                    try:
-                        key = row.pop(group_by)
-                    except KeyError:
-                        logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row))
-                        raise
-                    if self.group_by_sort_key:
-                        if key not in result:
-                            result[key] = []
-                        if row not in result[key]:
-                            result[key].append(row)
+                    key1 = self._get_key_for_group_by(row, group_by_level1, item_name)
+                    if group_by_level2:
+                        key2 = self._get_key_for_group_by(row, group_by_level2, item_name)
+                        if key1 not in result:
+                            result[key1] = {}
+                        if self.group_by_sort_key:
+                            if key2 not in result[key1]:
+                                result[key1][key2] = []
+                            result_key1_key2 = result[key1][key2]
+                            if row not in result_key1_key2:
+                                result_key1_key2.append(row)
+                        else:
+                            result[key1][key2] = row
                     else:
-                        result[key] = row
+                        if self.group_by_sort_key:
+                            if key1 not in result:
+                                result[key1] = []
+                            if row not in result[key1]:
+                                result[key1].append(row)
+                        else:
+                            result[key1] = row
                 else:
-                    msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by, row)
+                    msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by_level1, row)
                     logger.error(msg)
                     raise ValueError(msg)
             if self.group_by_sort_key:
-                for key, row in result.items():
-                    row.sort(key=self.group_by_sort_key)
+                if group_by_level2:
+                    for key1, row1 in result.items():
+                        for key2, row in row1.items():
+                            row.sort(key=self.group_by_sort_key)
+                else:
+                    for key, row in result.items():
+                        row.sort(key=self.group_by_sort_key)
             return result
         msg = "Unable to group {} by {}".format(item_name, group_by)
         logger.error(msg)

diff --git a/deepdiff/helper.py b/deepdiff/helper.py
@@ -8,7 +8,7 @@
 import string
 import time
 from ast import literal_eval
-from decimal import Decimal, localcontext
+from decimal import Decimal, localcontext, InvalidOperation as InvalidDecimalOperation
 from collections import namedtuple
 from itertools import repeat
 from ordered_set import OrderedSet
@@ -394,7 +394,13 @@ def number_to_string(number, significant_digits, number_format_notation="f"):
             # Precision = number of integer digits + significant_digits
             # Using number//1 to get the integer part of the number
             ctx.prec = len(str(abs(number // 1))) + significant_digits
-            number = number.quantize(Decimal('0.' + '0' * significant_digits))
+            try:
+                number = number.quantize(Decimal('0.' + '0' * significant_digits))
+            except InvalidDecimalOperation:
+                # Sometimes rounding up causes a higher precision to be needed for the quantize operation
+                # For example '999.99999999' will become '1000.000000' after quantize
+                ctx.prec += 1
+                number = number.quantize(Decimal('0.' + '0' * significant_digits))
     elif isinstance(number, only_complex_number):
         # Case for complex numbers.
         number = number.__class__(

diff --git a/deepdiff/serialization.py b/deepdiff/serialization.py
@@ -537,6 +537,7 @@ def _serialize_decimal(value):
 JSON_CONVERTOR = {
     decimal.Decimal: _serialize_decimal,
     ordered_set.OrderedSet: list,
+    set: list,
     type: lambda x: x.__name__,
     bytes: lambda x: x.decode('utf-8'),
     datetime.datetime: lambda x: x.isoformat(),

diff --git a/docs/basics.rst b/docs/basics.rst
@@ -148,9 +148,24 @@ Object attribute added:
 Group By
 --------
 
-group_by can be used when dealing with list of dictionaries to convert them to group them by value defined in group_by. The common use case is when reading data from a flat CSV and primary key is one of the columns in the CSV. We want to use the primary key to group the rows instead of CSV row number.
+group_by can be used when dealing with a list of dictionaries. It converts the list into a single dictionary keyed by the value each row holds for the field named in group_by. The common use case is reading data from a flat CSV in which one of the columns is the primary key: we want to group the rows by that primary key instead of by the CSV row number. group_by can also group on two levels (2D) when given a list of 2 keys.
 
-Example:
+For example:
+    >>> [
+    ...     {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
+    ...     {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
+    ...     {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
+    ... ]
+
+Becomes:
+    >>> t1 = {
+    ...     'AA': {'name': 'Joe', 'last_name': 'Nobody'},
+    ...     'BB': {'name': 'James', 'last_name': 'Blue'},
+    ...     'CC': {'name': 'Mike', 'last_name': 'Apple'},
+    ... }
+
+
+With that in mind, let's take a look at the following:
     >>> from deepdiff import DeepDiff
     >>> t1 = [
     ...     {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
@@ -187,5 +202,75 @@ Now we use group_by='id':
     >>> diff['values_changed'][0].up.up.t1
     {'AA': {'name': 'Joe', 'last_name': 'Nobody'}, 'BB': {'name': 'James', 'last_name': 'Blue'}, 'CC': {'name': 'Mike', 'last_name': 'Apple'}}
 
+2D Example:
+    >>> from pprint import pprint
+    >>> from deepdiff import DeepDiff
+    >>>
+    >>> t1 = [
+    ...     {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
+    ...     {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
+    ...     {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red'},
+    ...     {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
+    ... ]
+    >>>
+    >>> t2 = [
+    ...     {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
+    ...     {'id': 'BB', 'name': 'James', 'last_name': 'Brown'},
+    ...     {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
+    ... ]
+    >>>
+    >>> diff = DeepDiff(t1, t2, group_by=['id', 'name'])
+    >>> pprint(diff)
+    {'dictionary_item_removed': [root['BB']['Jimmy']],
+     'values_changed': {"root['BB']['James']['last_name']": {'new_value': 'Brown',
+                                                             'old_value': 'Blue'}}}
+
+.. _group_by_sort_key_label:
+
+Group By - Sort Key
+-------------------
+
+group_by_sort_key defines how dictionaries are sorted when more than one of them falls under the same group. When this parameter is used, group_by converts the list of dictionaries into a dictionary that maps each key to a list of dictionaries. group_by_sort_key is then used to sort the items within each of those lists.
+
+For example, suppose there are duplicate id values. If we only use group_by='id', one of the dictionaries with the id of 'BB' will overwrite the other. However, if we also set group_by_sort_key='name', we keep both dictionaries with the id of 'BB'.
+
+Example:
+
+    [{'id': 'AA', 'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'},
+     {'id': 'BB', 'int_id': 20, 'last_name': 'Blue', 'name': 'James'},
+     {'id': 'BB', 'int_id': 3, 'last_name': 'Red', 'name': 'Jimmy'},
+     {'id': 'CC', 'int_id': 4, 'last_name': 'Apple', 'name': 'Mike'}]
+
+
+Becomes:
+    {'AA': [{'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'}],
+     'BB': [{'int_id': 20, 'last_name': 'Blue', 'name': 'James'},
+            {'int_id': 3, 'last_name': 'Red', 'name': 'Jimmy'}],
+     'CC': [{'int_id': 4, 'last_name': 'Apple', 'name': 'Mike'}]}
+
+
+Example of using group_by_sort_key:
+    >>> t1 = [
+    ...     {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
+    ...     {'id': 'BB', 'name': 'James', 'last_name': 'Blue', 'int_id': 20},
+    ...     {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red', 'int_id': 3},
+    ...     {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
+    ... ]
+    >>>
+    >>> t2 = [
+    ...     {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
+    ...     {'id': 'BB', 'name': 'James', 'last_name': 'Brown', 'int_id': 20},
+    ...     {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
+    ... ]
+    >>>
+    >>> diff = DeepDiff(t1, t2, group_by='id', group_by_sort_key='name')
+    >>>
+    >>> pprint(diff)
+    {'iterable_item_removed': {"root['BB'][1]": {'int_id': 3,
+                                                 'last_name': 'Red',
+                                                 'name': 'Jimmy'}},
+     'values_changed': {"root['BB'][0]['last_name']": {'new_value': 'Brown',
+                                                       'old_value': 'Blue'}}}
+
 
 Back to :doc:`/index`
diff --git a/docs/diff_doc.rst b/docs/diff_doc.rst
@@ -79,8 +79,11 @@ include_obj_callback_strict: function, default = None
 get_deep_distance: Boolean, default = False
     :ref:`get_deep_distance_label` will get you the deep distance between objects. The distance is a number between 0 and 1 where zero means there is no diff between the 2 objects and 1 means they are very different. Note that this number should only be used to compare the similarity of 2 objects and nothing more. The algorithm for calculating this number may or may not change in the future releases of DeepDiff.
 
-group_by: String, default=None
-    :ref:`group_by_label` can be used when dealing with list of dictionaries to convert them to group them by value defined in group_by. The common use case is when reading data from a flat CSV and primary key is one of the columns in the CSV. We want to use the primary key to group the rows instead of CSV row number.
+group_by: String or a list of size 2, default=None
+    :ref:`group_by_label` can be used when dealing with a list of dictionaries. It converts the list into a single dictionary keyed by the value each row holds for the field named in group_by. The common use case is reading data from a flat CSV in which one of the columns is the primary key: we want to group the rows by that primary key instead of by the CSV row number. group_by can also group on two levels (2D) when given a list of 2 keys.
+
+group_by_sort_key: String or a function
+    :ref:`group_by_sort_key_label` defines how dictionaries are sorted when more than one of them falls under the same group. When this parameter is used, group_by converts the list of dictionaries into a dictionary that maps each key to a list of dictionaries. :ref:`group_by_sort_key_label` is then used to sort the items within each of those lists.
 
 hasher: default = DeepHash.sha256hex
     Hash function to be used. If you don't want SHA256, you can use your own hash function

diff --git a/docs/index.rst b/docs/index.rst
@@ -31,6 +31,15 @@ The DeepDiff library includes the following modules:
 What Is New
 ***********
 
+DeepDiff 6-6-1
+--------------
+
+    -  Fix for `DeepDiff raises decimal exception when using significant
+       digits <https://github.com/seperman/deepdiff/issues/426>`__
+    -  Introducing group_by_sort_key
+    -  Adding group_by 2D. For example
+       ``group_by=['last_name', 'zip_code']``
+
 DeepDiff 6-6-0
 --------------
 

diff --git a/tests/test_diff_text.py b/tests/test_diff_text.py
@@ -1144,18 +1144,19 @@ def test_int_to_unicode(self):
         }
         assert result == ddiff
 
-    @pytest.mark.parametrize("t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result", [
-        (43.265798602382986, 43.71677762295505, False, 0, "f", {'values_changed': {'root': {'new_value': 43.71677762295505, 'old_value': 43.265798602382986}}}),  # Note that it rounds the number so one becomes 43 and the other one is 44
-        (Decimal('2.5'), Decimal('1.5'), False, 0, "f", {}),
-        (Decimal('2.5'), Decimal('1.5'), False, 1, "f", {'values_changed': {'root': {'new_value': Decimal('1.5'), 'old_value': Decimal('2.5')}}}),
-        (Decimal('2.5'), Decimal(2.5), False, 3, "f", {}),
-        (1024, 1022, False, 2, "e", {}),
-        ({"key": [Decimal('2.0001'), Decimal('20000.0001')]}, {"key": [2.0002, 20000.0002]}, True, 4, "e", {'values_changed': {"root['key'][0]": {'new_value': 2.0002, 'old_value': Decimal('2.0001')}}})
+    @pytest.mark.parametrize("test_num, t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result", [
+        (1, 43.265798602382986, 43.71677762295505, False, 0, "f", {'values_changed': {'root': {'new_value': 43.71677762295505, 'old_value': 43.265798602382986}}}),  # Note that it rounds the number so one becomes 43 and the other one is 44
+        (2, Decimal('2.5'), Decimal('1.5'), False, 0, "f", {}),
+        (3, Decimal('2.5'), Decimal('1.5'), False, 1, "f", {'values_changed': {'root': {'new_value': Decimal('1.5'), 'old_value': Decimal('2.5')}}}),
+        (4, Decimal('2.5'), Decimal(2.5), False, 3, "f", {}),
+        (5, 1024, 1022, False, 2, "e", {}),
+        (6, {"key": [Decimal('2.0001'), Decimal('20000.0001')]}, {"key": [2.0002, 20000.0002]}, True, 4, "e", {'values_changed': {"root['key'][0]": {'new_value': 2.0002, 'old_value': Decimal('2.0001')}}}),
+        (7, [Decimal("999.99999999")], [Decimal("999.9999999999")], False, 6, "f", {}),
     ])
-    def test_significant_digits_and_notation(self, t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result):
+    def test_significant_digits_and_notation(self, test_num, t1, t2, ignore_numeric_type_changes, significant_digits, number_format_notation, result):
         ddiff = DeepDiff(t1, t2, significant_digits=significant_digits, number_format_notation=number_format_notation,
                          ignore_numeric_type_changes=ignore_numeric_type_changes)
-        assert result == ddiff
+        assert result == ddiff, f"test_significant_digits_and_notation #{test_num} failed."
 
     def test_significant_digits_for_complex_imaginary_part(self):
         t1 = 1.23 + 1.222254j
@@ -1745,8 +1746,43 @@ def test_group_by2_when_repeats(self):
         diff2 = DeepDiff(t1, t2, group_by='id', group_by_sort_key=lambda x: x['name'])
         assert expected_grouped == diff2
 
-        diff3 = DeepDiff(t1, t2, group_by='id', group_by_sort_key=lambda x: x['name'])
-        assert expected_grouped == diff3
+    def test_group_by3_when_repeats_and_group_by_list(self):  # 2D group_by (a list of two keys) on rows with a repeated 'id', first without and then with group_by_sort_key
+        t1 = [
+            {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
+            {'id': 'BB', 'name': 'James', 'last_name': 'Blue', 'int_id': 20},
+            {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red', 'int_id': 3},
+            {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
+        ]
+
+        t2 = [
+            {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
+            {'id': 'BB', 'name': 'James', 'last_name': 'Brown', 'int_id': 20},
+            {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
+        ]
+
+        diff1 = DeepDiff(t1, t2, group_by=['id', 'name'])  # rows end up nested as grouped[id][name]
+        expected_grouped = {
+            'dictionary_item_removed': ["root['BB']['Jimmy']"],
+            'values_changed': {
+                "root['BB']['James']['last_name']": {
+                    'new_value': 'Brown',
+                    'old_value': 'Blue'
+                }
+            }
+        }
+        assert expected_grouped == diff1
+
+        diff2 = DeepDiff(t1, t2, group_by=['id', 'name'], group_by_sort_key='int_id')  # with a sort key, each grouped[id][name] entry is a list of rows
+        expected_grouped = {
+            'dictionary_item_removed': ["root['BB']['Jimmy']"],
+            'values_changed': {
+                "root['BB']['James'][0]['last_name']": {  # the extra [0] indexes into that per-group list
+                    'new_value': 'Brown',
+                    'old_value': 'Blue'
+                }
+            }
+        }
+        assert expected_grouped == diff2
 
     def test_group_by_key_missing(self):
         t1 = [

diff --git a/tests/test_serialization.py b/tests/test_serialization.py
@@ -315,13 +315,16 @@ def test_pretty_form_method(self, expected, verbose_level):
         result = ddiff.pretty()
         assert result == expected
 
-    @pytest.mark.parametrize('test_num, value', [
-        (1, {'10': None}),
-        (2, {"type_changes": {"root": {"old_type": None, "new_type": list, "new_value": ["你好", 2, 3, 5]}}}),
-        (3, {'10': Decimal(2017)}),
-        (4, Decimal(2017.1)),
+    @pytest.mark.parametrize('test_num, value, func_to_convert_back', [
+        (1, {'10': None}, None),
+        (2, {"type_changes": {"root": {"old_type": None, "new_type": list, "new_value": ["你好", 2, 3, 5]}}}, None),
+        (3, {'10': Decimal(2017)}, None),
+        (4, Decimal(2017.1), None),
+        (5, {1, 2, 10}, set),
     ])
-    def test_json_dumps_and_loads(self, test_num, value):
+    def test_json_dumps_and_loads(self, test_num, value, func_to_convert_back):
         serialized = json_dumps(value)
         back = json_loads(serialized)
+        if func_to_convert_back:
+            back = func_to_convert_back(back)
         assert value == back, f"test_json_dumps_and_loads test #{test_num} failed"