Improve representation of MultiIndex (#6992)

Fixes: #6936 This PR changes `MultiIndex.__repr__` so that its output is more readable and easier to understand, similar to that of a pandas MultiIndex. The changes also include handling of `<NA>` and `nan` values and fixing the spacing issues around them. Authors: - galipremsagar <sagarprem75@gmail.com> Approvers: - null - Keith Kraus URL: #6992
rapidsai · Dec 21, 2020 · 7556e23 · 7556e23
1 parent 923cf49
commit 7556e23
Show file tree

Hide file tree

Showing 6 changed files with 513 additions and 298 deletions.
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -2305,15 +2305,9 @@ def partition(self, sep=" ", expand=True):
         Which will create a MultiIndex:
 
         >>> idx.str.partition()
-        MultiIndex(levels=[0    X
-        1    Y
-        dtype: object, 0
-        dtype: object, 0    123
-        1    999
-        dtype: object],
-        codes=   0  1  2
-        0  0  0  0
-        1  1  0  1)
+        MultiIndex([('X', ' ', '123'),
+                    ('Y', ' ', '999')],
+                   )
         """
         if expand is not True:
             raise NotImplementedError(
@@ -2375,15 +2369,9 @@ def rpartition(self, sep=" ", expand=True):
         Which will create a MultiIndex:
 
         >>> idx.str.rpartition()
-        MultiIndex(levels=[0    X
-        1    Y
-        dtype: object, 0
-        dtype: object, 0    123
-        1    999
-        dtype: object],
-        codes=   0  1  2
-        0  0  0  0
-        1  1  0  1)
+        MultiIndex([('X', ' ', '123'),
+                    ('Y', ' ', '999')],
+                   )
         """
         if expand is not True:
             raise NotImplementedError(

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -7248,23 +7248,15 @@ def from_pandas(obj, nan_as_null=None):
                 (3, 2),
                 (4, 2),
                 (5, 1)],
-            names=['x', 'y'])
+               names=['x', 'y'])
     >>> gmidx = cudf.from_pandas(pmidx)
     >>> gmidx
-    MultiIndex(levels=[0    1
-    1    3
-    2    4
-    3    5
-    dtype: int64, 0    1
-    1    2
-    2    5
-    dtype: int64],
-    codes=   x  y
-    0  0  0
-    1  0  2
-    2  1  1
-    3  2  1
-    4  3  0)
+    MultiIndex([(1, 1),
+                (1, 5),
+                (3, 2),
+                (4, 2),
+                (5, 1)],
+               names=['x', 'y'])
     >>> type(gmidx)
     <class 'cudf.core.multiindex.MultiIndex'>
     >>> type(pmidx)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -119,20 +119,12 @@ def size(self):
         ...                 names=["x", "y"],
         ...             )
         >>> midx
-        MultiIndex(levels=[0       a
-        1       b
-        2       c
-        3    None
-        dtype: object, 0       1
-        1    None
-        2       5
-        dtype: object],
-        codes=   x  y
-        0  0  0
-        1  0  2
-        2  1  1
-        3  2  1
-        4  3  0)
+        MultiIndex([( 'a',  '1'),
+                    ( 'a',  '5'),
+                    ( 'b', <NA>),
+                    ( 'c', <NA>),
+                    (<NA>,  '1')],
+                   names=['x', 'y'])
         >>> midx.size
         5
         """

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -126,14 +126,9 @@ def __init__(
         UInt64Index([1, 2, 3], dtype='uint64', name='a')
 
         >>> cudf.Index(cudf.DataFrame({"a":[1, 2], "b":[2, 3]}))
-        MultiIndex(levels=[0    1
-        1    2
-        dtype: int64, 0    2
-        1    3
-        dtype: int64],
-        codes=   a  b
-        0  0  0
-        1  1  1)
+        MultiIndex([(1, 2),
+                    (2, 3)],
+                   names=['a', 'b'])
         """
         pass
 
@@ -407,31 +402,17 @@ def dropna(self, how="any"):
         ...         names=["x", "y"],
         ...     )
         >>> midx
-        MultiIndex(levels=[0       1
-        1    null
-        2       4
-        3    null
-        dtype: int64, 0    1
-        1    2
-        2    5
-        dtype: int64],
-        codes=   x  y
-        0  0  0
-        1  0  2
-        2  1  1
-        3  2  1
-        4  3  0)
+        MultiIndex([(   1, 1),
+                    (   1, 5),
+                    (<NA>, 2),
+                    (   4, 2),
+                    (<NA>, 1)],
+                   names=['x', 'y'])
         >>> midx.dropna()
-        MultiIndex(levels=[0    1
-        1    4
-        dtype: int64, 0    1
-        1    2
-        2    5
-        dtype: int64],
-        codes=   x  y
-        0  0  0
-        1  0  2
-        2  1  1)
+        MultiIndex([(1, 1),
+                    (1, 5),
+                    (4, 2)],
+                   names=['x', 'y'])
         """
         return super().dropna(how=how)
 
@@ -516,16 +497,11 @@ def set_names(self, names, level=None, inplace=False):
         >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'],
         ... [2018, 2019]])
         >>> idx
-        MultiIndex(levels=[0     cobra
-        1    python
-        dtype: object, 0    2018
-        1    2019
-        dtype: int64],
-        codes=   0  1
-        0  1  0
-        1  1  1
-        2  0  0
-        3  0  1)
+        MultiIndex([('python', 2018),
+                    ('python', 2019),
+                    ( 'cobra', 2018),
+                    ( 'cobra', 2019)],
+                   )
         >>> idx.names
         FrozenList([None, None])
         >>> idx.set_names(['kind', 'year'], inplace=True)
@@ -622,20 +598,12 @@ def argsort(self, ascending=True, **kwargs):
         ...      names=["x", "y"],
         ... )
         >>> index
-        MultiIndex(levels=[0     1
-        1     3
-        2     4
-        3   -10
-        dtype: int64, 0     1
-        1    11
-        2     5
-        dtype: int64],
-        codes=   x  y
-        0  0  0
-        1  0  2
-        2  1  1
-        3  2  1
-        4  3  0)
+        MultiIndex([(  1,  1),
+                    (  1,  5),
+                    (  3, 11),
+                    (  4, 11),
+                    (-10,  1)],
+                   names=['x', 'y'])
         >>> index.argsort()
         array([4, 0, 1, 2, 3], dtype=int32)
         >>> index.argsort(ascending=False)
@@ -980,50 +948,26 @@ def sort_values(self, return_indexer=False, ascending=True, key=None):
         ...      names=["x", "y"],
         ... )
         >>> midx
-        MultiIndex(levels=[0     1
-        1     3
-        2     4
-        3   -10
-        dtype: int64, 0     1
-        1    11
-        2     5
-        dtype: int64],
-        codes=   x  y
-        0  0  0
-        1  0  2
-        2  1  1
-        3  2  1
-        4  3  0)
+        MultiIndex([(  1,  1),
+                    (  1,  5),
+                    (  3, 11),
+                    (  4, 11),
+                    (-10,  1)],
+                   names=['x', 'y'])
         >>> midx.sort_values()
-        MultiIndex(levels=[0     1
-        1     3
-        2     4
-        3   -10
-        dtype: int64, 0     1
-        1    11
-        2     5
-        dtype: int64],
-        codes=   x  y
-        4  3  0
-        0  0  0
-        1  0  2
-        2  1  1
-        3  2  1)
+        MultiIndex([(-10,  1),
+                    (  1,  1),
+                    (  1,  5),
+                    (  3, 11),
+                    (  4, 11)],
+                   names=['x', 'y'])
         >>> midx.sort_values(ascending=False)
-        MultiIndex(levels=[0     1
-        1     3
-        2     4
-        3   -10
-        dtype: int64, 0     1
-        1    11
-        2     5
-        dtype: int64],
-        codes=   x  y
-        3  2  1
-        2  1  1
-        1  0  2
-        0  0  0
-        4  3  0)
+        MultiIndex([(  4, 11),
+                    (  3, 11),
+                    (  1,  5),
+                    (  1,  1),
+                    (-10,  1)],
+                   names=['x', 'y'])
         """
         if key is not None:
             raise NotImplementedError("key parameter is not yet implemented.")
@@ -1138,16 +1082,18 @@ def join(
         >>> lhs = cudf.DataFrame(
         ...     {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b']
         ... ).index
+        >>> lhs
+        MultiIndex([(2, 3),
+                    (3, 4),
+                    (1, 2)],
+                   names=['a', 'b'])
         >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index
+        >>> rhs
+        Int64Index([1, 4, 3], dtype='int64', name='a')
         >>> lhs.join(rhs, how='inner')
-        MultiIndex(levels=[0    1
-        1    3
-        dtype: int64, 0    2
-        1    4
-        dtype: int64],
-        codes=   a  b
-        0  1  1
-        1  0  0)
+        MultiIndex([(3, 4),
+                    (1, 2)],
+                   names=['a', 'b'])
         """
 
         if isinstance(self, cudf.MultiIndex) and isinstance(