From 9e35bce78b1a6c367b0b373e5c29cfd841296e13 Mon Sep 17 00:00:00 2001 From: Joel Ostblom Date: Wed, 17 Nov 2021 18:57:33 -0800 Subject: [PATCH 1/6] Represent pandas ordered categoricals as ordinal data --- altair/utils/core.py | 11 ++++++++--- tests/vegalite/v4/tests/test_api.py | 22 ++++++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/altair/utils/core.py b/altair/utils/core.py index 53785b174..a82dd3e03 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -193,8 +193,6 @@ def infer_vegalite_type(data): # Otherwise, infer based on the dtype of the input typ = infer_dtype(data) - # TODO: Once this returns 'O', please update test_select_x and test_select_y in test_api.py - if typ in [ "floating", "mixed-integer-float", @@ -203,6 +201,8 @@ def infer_vegalite_type(data): "complex", ]: return "quantitative" + elif typ == "categorical" and data.cat.ordered: + return ("ordinal", data.cat.categories.tolist()) elif typ in ["string", "bytes", "categorical", "boolean", "mixed", "unicode"]: return "nominal" elif typ in [ @@ -316,8 +316,9 @@ def to_list_if_array(val): for col_name, dtype in df.dtypes.items(): if str(dtype) == "category": - # XXXX: work around bug in to_json for categorical types + # Work around bug in to_json for categorical types in older versions of pandas # https://github.com/pydata/pandas/issues/10778 + # https://github.com/altair-viz/altair/pull/2170 col = df[col_name].astype(object) df[col_name] = col.where(col.notnull(), None) elif str(dtype) == "string": @@ -527,6 +528,10 @@ def parse_shorthand( if isinstance(data, pd.DataFrame) and "type" not in attrs: if "field" in attrs and attrs["field"] in data.columns: attrs["type"] = infer_vegalite_type(data[attrs["field"]]) + # Ordinal dataframe columns return the type and sort order as a tuple + if isinstance(attrs["type"], tuple): + attrs["sort"] = attrs["type"][1] + attrs["type"] = attrs["type"][0] return attrs diff --git a/tests/vegalite/v4/tests/test_api.py 
b/tests/vegalite/v4/tests/test_api.py index 66ec78163..932868736 100644 --- a/tests/vegalite/v4/tests/test_api.py +++ b/tests/vegalite/v4/tests/test_api.py @@ -118,6 +118,7 @@ def test_chart_infer_types(): "x": pd.date_range("2012", periods=10, freq="Y"), "y": range(10), "c": list("abcabcabca"), + "s": pd.Categorical([1, 2] * 5, categories=[2, 1], ordered=True), } ) @@ -129,32 +130,45 @@ def _check_encodings(chart): assert dct["encoding"]["y"]["field"] == "y" assert dct["encoding"]["color"]["type"] == "nominal" assert dct["encoding"]["color"]["field"] == "c" + assert dct["encoding"]["size"]["type"] == "ordinal" + assert dct["encoding"]["size"]["field"] == "s" + assert dct["encoding"]["size"]["sort"] == [2, 1] # Pass field names by keyword - chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c") + chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c", size="s") _check_encodings(chart) # pass Channel objects by keyword chart = ( alt.Chart(data) .mark_point() - .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c")) + .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"), size=alt.Size("s")) ) _check_encodings(chart) # pass Channel objects by value - chart = alt.Chart(data).mark_point().encode(alt.X("x"), alt.Y("y"), alt.Color("c")) + chart = ( + alt.Chart(data) + .mark_point() + .encode(alt.X("x"), alt.Y("y"), alt.Color("c"), alt.Size("s")) + ) _check_encodings(chart) # override default types chart = ( alt.Chart(data) .mark_point() - .encode(alt.X("x", type="nominal"), alt.Y("y", type="ordinal")) + .encode( + alt.X("x", type="nominal"), + alt.Y("y", type="ordinal"), + alt.Size("s", type="nominal", sort=None), + ) ) dct = chart.to_dict() assert dct["encoding"]["x"]["type"] == "nominal" assert dct["encoding"]["y"]["type"] == "ordinal" + assert dct["encoding"]["size"]["type"] == "nominal" + assert dct["encoding"]["size"]["sort"] is None @pytest.mark.parametrize( From 7d42367f4379177cd71624974301dfcb2d9ceda2 Mon Sep 17 00:00:00 2001 From: 
Joel Ostblom Date: Fri, 6 Jan 2023 11:51:25 +0100 Subject: [PATCH 2/6] Move new test to v5 from v4 --- tests/vegalite/v4/tests/test_api.py | 22 ++++------------------ tests/vegalite/v5/tests/test_api.py | 22 ++++++++++++++++++---- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/vegalite/v4/tests/test_api.py b/tests/vegalite/v4/tests/test_api.py index 932868736..66ec78163 100644 --- a/tests/vegalite/v4/tests/test_api.py +++ b/tests/vegalite/v4/tests/test_api.py @@ -118,7 +118,6 @@ def test_chart_infer_types(): "x": pd.date_range("2012", periods=10, freq="Y"), "y": range(10), "c": list("abcabcabca"), - "s": pd.Categorical([1, 2] * 5, categories=[2, 1], ordered=True), } ) @@ -130,45 +129,32 @@ def _check_encodings(chart): assert dct["encoding"]["y"]["field"] == "y" assert dct["encoding"]["color"]["type"] == "nominal" assert dct["encoding"]["color"]["field"] == "c" - assert dct["encoding"]["size"]["type"] == "ordinal" - assert dct["encoding"]["size"]["field"] == "s" - assert dct["encoding"]["size"]["sort"] == [2, 1] # Pass field names by keyword - chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c", size="s") + chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c") _check_encodings(chart) # pass Channel objects by keyword chart = ( alt.Chart(data) .mark_point() - .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"), size=alt.Size("s")) + .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c")) ) _check_encodings(chart) # pass Channel objects by value - chart = ( - alt.Chart(data) - .mark_point() - .encode(alt.X("x"), alt.Y("y"), alt.Color("c"), alt.Size("s")) - ) + chart = alt.Chart(data).mark_point().encode(alt.X("x"), alt.Y("y"), alt.Color("c")) _check_encodings(chart) # override default types chart = ( alt.Chart(data) .mark_point() - .encode( - alt.X("x", type="nominal"), - alt.Y("y", type="ordinal"), - alt.Size("s", type="nominal", sort=None), - ) + .encode(alt.X("x", type="nominal"), alt.Y("y", 
type="ordinal")) ) dct = chart.to_dict() assert dct["encoding"]["x"]["type"] == "nominal" assert dct["encoding"]["y"]["type"] == "ordinal" - assert dct["encoding"]["size"]["type"] == "nominal" - assert dct["encoding"]["size"]["sort"] is None @pytest.mark.parametrize( diff --git a/tests/vegalite/v5/tests/test_api.py b/tests/vegalite/v5/tests/test_api.py index b2e5aa159..684f64a4c 100644 --- a/tests/vegalite/v5/tests/test_api.py +++ b/tests/vegalite/v5/tests/test_api.py @@ -123,6 +123,7 @@ def test_chart_infer_types(): "x": pd.date_range("2012", periods=10, freq="Y"), "y": range(10), "c": list("abcabcabca"), + "s": pd.Categorical([1, 2] * 5, categories=[2, 1], ordered=True), } ) @@ -134,32 +135,45 @@ def _check_encodings(chart): assert dct["encoding"]["y"]["field"] == "y" assert dct["encoding"]["color"]["type"] == "nominal" assert dct["encoding"]["color"]["field"] == "c" + assert dct["encoding"]["size"]["type"] == "ordinal" + assert dct["encoding"]["size"]["field"] == "s" + assert dct["encoding"]["size"]["sort"] == [2, 1] # Pass field names by keyword - chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c") + chart = alt.Chart(data).mark_point().encode(x="x", y="y", color="c", size="s") _check_encodings(chart) # pass Channel objects by keyword chart = ( alt.Chart(data) .mark_point() - .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c")) + .encode(x=alt.X("x"), y=alt.Y("y"), color=alt.Color("c"), size=alt.Size("s")) ) _check_encodings(chart) # pass Channel objects by value - chart = alt.Chart(data).mark_point().encode(alt.X("x"), alt.Y("y"), alt.Color("c")) + chart = ( + alt.Chart(data) + .mark_point() + .encode(alt.X("x"), alt.Y("y"), alt.Color("c"), alt.Size("s")) + ) _check_encodings(chart) # override default types chart = ( alt.Chart(data) .mark_point() - .encode(alt.X("x", type="nominal"), alt.Y("y", type="ordinal")) + .encode( + alt.X("x", type="nominal"), + alt.Y("y", type="ordinal"), + alt.Size("s", type="nominal", sort=None), + ) ) dct = 
chart.to_dict() assert dct["encoding"]["x"]["type"] == "nominal" assert dct["encoding"]["y"]["type"] == "ordinal" + assert dct["encoding"]["size"]["type"] == "nominal" + assert dct["encoding"]["size"]["sort"] is None @pytest.mark.parametrize( From 3159663ea70a218f55a6ab3609572e80f73c4ec2 Mon Sep 17 00:00:00 2001 From: Joel Ostblom Date: Fri, 6 Jan 2023 12:38:18 +0100 Subject: [PATCH 3/6] Add notes about categorical sorting to the docs --- doc/user_guide/encodings/channels.rst | 12 +++++++----- doc/user_guide/encodings/index.rst | 16 ++++++++++------ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/doc/user_guide/encodings/channels.rst b/doc/user_guide/encodings/channels.rst index 057db9c71..073580b21 100644 --- a/doc/user_guide/encodings/channels.rst +++ b/doc/user_guide/encodings/channels.rst @@ -110,7 +110,7 @@ We map the ``symbol`` variable to ``detail`` to use them to group lines. Order ~~~~~ -The `order` option and :class:`Order` channel can sort how marks are drawn on the chart. +The ``order`` option and :class:`Order` channel can sort how marks are drawn on the chart. For stacked marks, this controls the order of components of the stack. Here, the elements of each bar are sorted alphabetically by the name of the nominal data in the color channel. @@ -144,9 +144,6 @@ The order can be reversed by changing the sort option to `descending`. order=alt.Order("site", sort="descending") ) -If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment `_, although there might be edge cases where this is not fully supported. This also makes the order of the segments align with the order colors shows up in a legend that uses custom sorting for the color domain. - - The same approach works for other mark types, like stacked areas charts. .. altair-plot:: @@ -163,7 +160,12 @@ The same approach works for other mark types, like stacked areas charts. 
order=alt.Order("site", sort="ascending") ) -For line marks, the `order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes. +Note that unlike the ``sort`` parameter to positional encoding channels, +the :class:`Order` channel cannot take a list of values to sort by +and is not automatically sorted when an ordered pandas categorical column is passed. +If we want to sort stacked segments in a custom order, we can `follow the approach in this issue comment `_, although there might be edge cases where this is not fully supported. This workaround also makes the order of the segments align with the order in which the colors show up in a legend that uses custom sorting for the color domain. + +For line marks, the :class:`Order` channel encodes the order in which data points are connected. This can be useful for creating a scatter plot that draws lines between the dots using a different field than the x and y axes. .. altair-plot:: diff --git a/doc/user_guide/encodings/index.rst b/doc/user_guide/encodings/index.rst index b46acf6b6..695cf7628 100644 --- a/doc/user_guide/encodings/index.rst +++ b/doc/user_guide/encodings/index.rst @@ -318,18 +318,22 @@ Sort Option ~~~~~~~~~~~ Some channels accept a :class:`sort` option which determines the -order of the scale being used for the channel. There are a number of different -sort options available: +order of the scale being used for the channel. +By default the scale is sorted in ascending alphabetical order, +unless an `ordered pandas categorical column `_ is passed +in which case Altair will use the column's inherent order to sort the scale. +There are a number of different +options available to change the sort order: - ``sort='ascending'`` (Default) will sort the field's value in ascending order. - for string data, this uses standard alphabetical order. 
+ For string data, this uses standard alphabetical order. - ``sort='descending'`` will sort the field's value in descending order -- passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for +- Passing the name of an encoding channel to ``sort``, such as ``"x"`` or ``"y"``, allows for sorting by that channel. An optional minus prefix can be used for a descending sort. For example ``sort='-x'`` would sort by the x channel in descending order. -- passing a list to ``sort`` allows you to explicitly set the order in which +- Passing a list to ``sort`` allows you to explicitly set the order in which you would like the encoding to appear -- passing a :class:`EncodingSortField` class to ``sort`` allows you to sort +- Passing a :class:`EncodingSortField` class to ``sort`` allows you to sort an axis by the value of some other field in the dataset. Here is an example of applying these five different sort approaches on the From 54f5b0a0d099339a348260bd4da4ed8b4436faf2 Mon Sep 17 00:00:00 2001 From: Joel Ostblom Date: Fri, 6 Jan 2023 12:44:04 +0100 Subject: [PATCH 4/6] Note that specifying the type explicitly remove the autodetection of the order --- doc/user_guide/encodings/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user_guide/encodings/index.rst b/doc/user_guide/encodings/index.rst index 695cf7628..32073259e 100644 --- a/doc/user_guide/encodings/index.rst +++ b/doc/user_guide/encodings/index.rst @@ -320,7 +320,7 @@ Sort Option Some channels accept a :class:`sort` option which determines the order of the scale being used for the channel. By default the scale is sorted in ascending alphabetical order, -unless an `ordered pandas categorical column `_ is passed +unless an `ordered pandas categorical column `_ is passed (without an explicit type specification) in which case Altair will use the column's inherent order to sort the scale. 
There are a number of different options available to change the sort order: From 9ad570b841e0754c4d91bc58e6548cb1c0351eae Mon Sep 17 00:00:00 2001 From: Joel Ostblom Date: Thu, 19 Jan 2023 14:38:44 +0100 Subject: [PATCH 5/6] Remove automatic sort order of categorical data if a non-ordinal type is specified --- altair/utils/schemapi.py | 8 ++++++++ tests/vegalite/v5/tests/test_api.py | 4 ++-- tools/schemapi/schemapi.py | 8 ++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index 10cb16e21..368dd0049 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -364,6 +364,14 @@ def to_dict(self, validate=True, ignore=None, context=None): # parsed_shorthand is removed from context if it exists so that it is # not passed to child to_dict function calls parsed_shorthand = context.pop("parsed_shorthand", {}) + # Prevent pandas categorical data from being automatically sorted + # when a non-ordinal data type is specified manually + if "sort" in parsed_shorthand and kwds["type"] not in [ + "ordinal", + Undefined, + ]: + parsed_shorthand.pop("sort") + kwds.update( { k: v diff --git a/tests/vegalite/v5/tests/test_api.py b/tests/vegalite/v5/tests/test_api.py index 684f64a4c..12e8b6668 100644 --- a/tests/vegalite/v5/tests/test_api.py +++ b/tests/vegalite/v5/tests/test_api.py @@ -166,14 +166,14 @@ def _check_encodings(chart): .encode( alt.X("x", type="nominal"), alt.Y("y", type="ordinal"), - alt.Size("s", type="nominal", sort=None), + alt.Size("s", type="nominal"), ) ) dct = chart.to_dict() assert dct["encoding"]["x"]["type"] == "nominal" assert dct["encoding"]["y"]["type"] == "ordinal" assert dct["encoding"]["size"]["type"] == "nominal" - assert dct["encoding"]["size"]["sort"] is None + assert "sort" not in dct["encoding"]["size"] @pytest.mark.parametrize( diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py index f6e85a4b2..89b8f03b2 100644 --- a/tools/schemapi/schemapi.py +++ 
b/tools/schemapi/schemapi.py @@ -362,6 +362,14 @@ def to_dict(self, validate=True, ignore=None, context=None): # parsed_shorthand is removed from context if it exists so that it is # not passed to child to_dict function calls parsed_shorthand = context.pop("parsed_shorthand", {}) + # Prevent pandas categorical data from being automatically sorted + # when a non-ordinal data type is specified manually + if "sort" in parsed_shorthand and kwds["type"] not in [ + "ordinal", + Undefined, + ]: + parsed_shorthand.pop("sort") + kwds.update( { k: v From dc74f9e14d8da3fa7fb6848b9bc73dbef89f285f Mon Sep 17 00:00:00 2001 From: Mattijn van Hoek Date: Wed, 25 Jan 2023 20:34:31 +0100 Subject: [PATCH 6/6] Update altair/utils/core.py --- altair/utils/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/altair/utils/core.py b/altair/utils/core.py index a82dd3e03..c47b9a04d 100644 --- a/altair/utils/core.py +++ b/altair/utils/core.py @@ -528,7 +528,7 @@ def parse_shorthand( if isinstance(data, pd.DataFrame) and "type" not in attrs: if "field" in attrs and attrs["field"] in data.columns: attrs["type"] = infer_vegalite_type(data[attrs["field"]]) - # Ordinal dataframe columns return the type and sort order as a tuple + # ordered categorical dataframe columns return the type and sort order as a tuple if isinstance(attrs["type"], tuple): attrs["sort"] = attrs["type"][1] attrs["type"] = attrs["type"][0]