
ENH: allow JSON (de)serialization of ExtensionDtypes #44722

Merged
merged 7 commits on Dec 19, 2021

Conversation

@jmg-duarte (Contributor) commented Dec 2, 2021

This PR allows ExtensionDtypes to be (de)serialized using .to_json(orient='table').

Example:

import datetime as dt
from typing import Optional, Sequence, Tuple, Union, cast

import numpy as np

from pandas._typing import Dtype
from pandas.api.extensions import (
    ExtensionArray,
    ExtensionDtype,
    register_extension_dtype,
)
from pandas.api.types import pandas_dtype


@register_extension_dtype
class DateDtype(ExtensionDtype):
    @property
    def type(self):
        return dt.date

    @property
    def name(self):
        return "DateDtype"

    @classmethod
    def construct_from_string(cls, string: str):
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        if string == cls.__name__:
            return cls()
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    @classmethod
    def construct_array_type(cls):
        return DateArray

    @property
    def na_value(self):
        return dt.date.min

    def __repr__(self) -> str:
        return self.name


class DateArray(ExtensionArray):
    def __init__(
        self,
        dates: Union[
            Sequence[dt.date], Tuple[np.ndarray, np.ndarray, np.ndarray], np.ndarray
        ],
    ) -> None:
        ldates = len(dates)
        if isinstance(dates, list):
            # pre-allocate the arrays since we know the size before hand
            self._year = np.zeros(ldates, dtype=np.uint16)  # uint16 max 65535; years in (1, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # uint8 max 255; months in (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # uint8 max 255; days in (1, 31)
            # populate them
            for i, (y, m, d) in enumerate(
                map(lambda date: (date.year, date.month, date.day), dates)
            ):
                self._year[i] = y
                self._month[i] = m
                self._day[i] = d

        elif isinstance(dates, tuple):
            # only support triples
            if ldates != 3:
                raise ValueError("only triples are valid")
            # check if all elements have the same type
            if not isinstance(dates[0], np.ndarray):
                raise TypeError(
                    f"tuple 0-element has an invalid type: {type(dates[0])}"
                )
            if not isinstance(dates[1], np.ndarray):
                raise TypeError(
                    f"tuple 1-element has an invalid type: {type(dates[1])}"
                )
            if not isinstance(dates[2], np.ndarray):
                raise TypeError(
                    f"tuple 2-element has an invalid type: {type(dates[2])}"
                )

            ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates)
            if not ly == lm == ld:
                raise ValueError(
                    f"tuple members must have the same length: {(ly, lm, ld)}"
                )

            self._year = dates[0].astype(np.uint16)
            self._month = dates[1].astype(np.uint8)
            self._day = dates[2].astype(np.uint8)

        elif isinstance(dates, np.ndarray) and dates.dtype == "U10":
            self._year = np.zeros(ldates, dtype=np.uint16)  # uint16 max 65535; years in (1, 9999)
            self._month = np.zeros(ldates, dtype=np.uint8)  # uint8 max 255; months in (1, 12)
            self._day = np.zeros(ldates, dtype=np.uint8)  # uint8 max 255; days in (1, 31)

            for i, dstr in np.ndenumerate(np.char.split(dates, sep="-")):
                self._year[i] = int(dstr[0])
                self._month[i] = int(dstr[1])
                self._day[i] = int(dstr[2])

        else:
            raise TypeError(f"{type(dates)} is not supported")

    @property
    def dtype(self) -> ExtensionDtype:
        return DateDtype()

    def astype(self, dtype, copy=True):
        dtype = pandas_dtype(dtype)

        if isinstance(dtype, DateDtype):
            data = self.copy() if copy else self
        else:
            # NOTE(duarte,30/11/21): na_value is probably wrong
            data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)

        return data

    @property
    def nbytes(self) -> int:
        return self._year.nbytes + self._month.nbytes + self._day.nbytes

    def __len__(self) -> int:
        return len(self._year)  # all 3 arrays are enforced to have the same length

    def __getitem__(self, item) -> Union[dt.date, "DateArray"]:
        if isinstance(item, int):
            return dt.date(self._year[item], self._month[item], self._day[item])
        else:
            raise NotImplementedError("only ints are supported as indexes")

    def __setitem__(self, key: Union[int, np.ndarray], value) -> None:
        if not isinstance(key, int):
            raise NotImplementedError("only ints are supported as indexes")

        if not isinstance(value, dt.date):
            raise TypeError("you can only set datetime.date types")

        self._year[key] = value.year
        self._month[key] = value.month
        self._day[key] = value.day

    def __repr__(self) -> str:
        return f"DateArray{list(zip(self._year, self._month, self._day))}"

    def copy(self) -> "DateArray":
        return DateArray((self._year.copy(), self._month.copy(), self._day.copy()))

    def isna(self) -> np.ndarray:
        return np.logical_and(
            np.logical_and(
                self._year == dt.date.min.year, self._month == dt.date.min.month
            ),
            self._day == dt.date.min.day,
        )

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False):
        if isinstance(scalars, dt.date):
            # a lone scalar: wrap it in a length-1 array
            return DateArray([scalars])
        elif isinstance(scalars, DateArray):
            return scalars.copy() if copy else scalars
        elif isinstance(scalars, np.ndarray):
            scalars = scalars.astype("U10")  # 10 chars for yyyy-mm-dd
            return DateArray(scalars)
        raise TypeError(f"{type(scalars)} is not supported")

# Usage example:
import pandas as pd
import datetime as dt

da = DateArray([dt.date.today()])
df = pd.DataFrame({'da': da})
df_rtt = pd.read_json(df.to_json(orient='table'), orient='table')
# df.equals(df_rtt) won't work because:
# * the indexes are different (already happened)
# * the DateArray class only implements ints as indexes (it's only a PoC)
assert (df.dtypes == df_rtt.dtypes).all()

@Dr-Irv (Contributor) commented Dec 2, 2021

@jmg-duarte Can you do the following:

  1. Add a test for this new functionality. You might want to follow the pattern in pandas/tests/io/json/test_json_table_schema.py, but create a new file that uses your example extension type (date/time) and also do something with Decimal as well. You can import an extension dtype implementation for Decimal from the package pandas.tests.extension.decimal.
  2. Update the documentation for read_table() to discuss this new functionality.
  3. Update the what's new document to reflect this new functionality

@jmg-duarte (Contributor, Author) commented Dec 2, 2021

  2. Update the documentation for read_table() to discuss this new functionality.

Do you mean read_json()?

@Dr-Irv (Contributor) commented Dec 2, 2021

  2. Update the documentation for read_table() to discuss this new functionality.

Do you mean read_json()?

Yes. Also, to_json() and the docs corresponding to https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html?highlight=read_json#table-schema, and more generally https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html?highlight=read_json#json

We'll need to state that for user-implemented extension arrays, we're possibly generating a JSON Table Schema that is non-conforming (e.g., if you do the test for Decimal, I would anticipate that would happen). It's probably worth incrementing the schema version as well.

@Dr-Irv (Contributor) commented Dec 2, 2021

One other note - I see that tests are already failing. You might want to run the existing tests for the table schema locally before pushing your next commit:

pytest -n 4 pandas/tests/io/json/test_json_table_schema.py

(The -n 4 argument runs the tests in parallel to give you faster results)

@jmg-duarte (Contributor, Author) commented Dec 2, 2021

A little checklist to track the current progress:

  • Add tests to pandas/io/json/test_json_table_schema_ext_dtype.py
    • Include Decimal in the tests
    • Pass tests
  • Update the docs
    • to_json()
    • read_json()
  • Increment the schema version
    • Move TABLE_SCHEMA_VERSION to _table_schema.py

@Dr-Irv (Contributor) commented Dec 2, 2021

@attack68 You might be the best person to review this PR once the checklist at #44722 (comment) is complete. If not, do you have other suggestions?

@jmg-duarte (Contributor, Author) commented Dec 2, 2021

I just tested Decimal on master: while it serializes, it does not survive the round trip.

In [1]: import pandas as pd

In [2]: from pandas.tests.extension.decimal import DecimalArray

In [3]: from decimal import Decimal

In [4]: pd.read_json(pd.DataFrame({'dec': DecimalArray([Decimal(10)])}).to_json(orient='table'), orient='table')
Out[4]: 
    dec
0  10.0

In [5]: pd.read_json(pd.DataFrame({'dec': DecimalArray([Decimal(10)])}).to_json(orient='table'), orient='table').dtypes
Out[5]: 
dec    object
dtype: object

In [6]: pd.DataFrame({'dec': DecimalArray([Decimal(10)])}).to_json(orient='table')
Out[6]: '{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"dec","type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},"data":[{"index":0,"dec":10.0}]}'

Out of the box, the solution proposed in this PR will throw the following error:

TypeError: All values must be of type <class 'decimal.Decimal'>

Due to the following code:

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
    return cls(scalars)

With a bit of adapting I can make this work, however I'm open to feedback on how to handle this.
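For illustration, one possible adaptation is sketched below using a toy stand-in class (this is not the PR's actual change; `ToyDecimalArray` is hypothetical): coerce string scalars back into `Decimal` inside `_from_sequence` before constructing the array, so deserialized JSON values pass the type check.

```python
from decimal import Decimal


class ToyDecimalArray:
    """Toy stand-in for the test DecimalArray (illustrative only)."""

    def __init__(self, values):
        # Mirrors the strict check that raises the TypeError shown above.
        if not all(isinstance(v, Decimal) for v in values):
            raise TypeError("All values must be of type <class 'decimal.Decimal'>")
        self._data = list(values)

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        # Coerce strings (as read back from JSON) to Decimal first,
        # so deserialization no longer trips the type check.
        return cls([Decimal(s) if isinstance(s, str) else s for s in scalars])


arr = ToyDecimalArray._from_sequence(["10", Decimal("2.5")])
```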

@jmg-duarte (Contributor, Author)

I've updated the to_json() documentation. I don't think read_json() needs it, but I don't mind adding a reference to the method the user needs to override to tap into the serialization. Let me know your thoughts.

On another note, why don't these lines use the TABLE_SCHEMA_VERSION variable?

if version:
    schema["pandas_version"] = "0.20.0"
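A sketch of the suggested fix: read the version from a single module-level constant instead of a hard-coded literal. The names mirror pandas' internals, but this is an illustrative fragment, not the actual patch.

```python
# Illustrative fragment: reuse a module-level constant rather than a
# hard-coded literal, so bumping the schema version is a one-line change.
TABLE_SCHEMA_VERSION = "0.20.0"  # value at the time of this discussion


def add_schema_version(schema, version=True):
    # Mirrors the branch quoted above, but reads from the constant.
    if version:
        schema["pandas_version"] = TABLE_SCHEMA_VERSION
    return schema
```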

@Dr-Irv (Contributor) commented Dec 2, 2021

With a bit of adapting I can make this work, however I'm open to feedback on how to handle this.

That's why I suggested using ExtensionArray._from_sequence_of_strings() for deserialization and ExtensionArray._formatter() for serialization: you would then just put the string representation in the JSON, the two methods should be inverses of each other, and, independently of the extension array implementation, you know the string representations of the underlying objects are there.
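The pairing can be sketched as follows with a hypothetical class (not pandas' actual implementation): `_formatter` returns a scalar-to-string function used when writing the JSON, and `_from_sequence_of_strings` inverts it when reading.

```python
from decimal import Decimal


class DecimalLikeArray:
    """Hypothetical array illustrating the formatter/parser pairing."""

    def __init__(self, values):
        self._data = list(values)

    def _formatter(self, boxed=False):
        # Serialization side: how each scalar becomes a string.
        return str

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
        # Deserialization side: the inverse of the formatter.
        return cls([Decimal(s) for s in strings])


arr = DecimalLikeArray([Decimal("10"), Decimal("2.5")])
fmt = arr._formatter()
strings = [fmt(v) for v in arr._data]
roundtripped = DecimalLikeArray._from_sequence_of_strings(strings)
```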

@Dr-Irv (Contributor) commented Dec 2, 2021

On another note, why don't these lines use the TABLE_SCHEMA_VERSION variable?

Seems like this was never noticed before. Can you move TABLE_SCHEMA_VERSION out of _json.py and into _table_schema.py as part of this PR?

Note in the docs in _json.py and _table_schema.py, there are example outputs shown that have that number in it, so when you change the version, you should also change the examples as well.

@Dr-Irv (Contributor) commented Dec 2, 2021

I've updated the to_json() documentation, I don't think read_json() needs it but I don't mind adding a reference to which method the user needs to override to tap into the serialization. Let me know your thoughts.

I think the overview pages I referred to earlier are where you want to discuss how ExtensionArray is handled with respect to the table schema. For the API docs, you just need to indicate that if the DataFrame has a Series backed by a user-created ExtensionArray, then the output will not conform to the table schema standard because pandas will be outputting types that are not part of that standard, but that round-tripping (i.e., read_json(df.to_json(orient="table"), orient="table")) will work.

@jmg-duarte (Contributor, Author)

That's why I suggested using ExtensionArray._from_sequence_of_strings() for deserialization and ExtensionArray._formatter() for serialization: you would then just put the string representation in the JSON, the two methods should be inverses of each other, and, independently of the extension array implementation, you know the string representations of the underlying objects are there.

Due to my lack of experience with pandas (and the sparse documentation), I'm taking an ad hoc approach: implement a bit, see what breaks, repeat.
I'll put breakpoints in the methods you suggest and work from there.

Seems like this was never noticed before. Can you move TABLE_SCHEMA_VERSION out of _json.py and into _table_schema.py as part of this PR?

Note in the docs in _json.py and _table_schema.py, there are example outputs shown that have that number in it, so when you change the version, you should also change the examples as well.

Sure thing, I will address it.

I think the overview pages I referred to earlier are where you want to discuss how ExtensionArray is handled with respect to the table schema. For the API docs, you just need to indicate that if the DataFrame has a Series backed by a user-created ExtensionArray, then the output will not conform to the table schema standard because pandas will be outputting types that are not part of that standard, but that round-tripping (i.e., read_json(df.to_json(orient="table"), orient="table")) will work.

I hadn't noticed the links were for "extra" documentation, I will also address this.

@Dr-Irv (Contributor) commented Dec 2, 2021

@jmg-duarte One other note, since this is your first PR: don't push every commit you make, as each push fires off our continuous integration testing scripts while you know you are still working on things. Make sure the testing scripts pass locally on your own machine; when that's true, push, and the full suite of tests will run, possibly catching errors in environments that differ from your own. When you make the doc changes, build them locally to make sure they look right before pushing.

@jmg-duarte (Contributor, Author)

@Dr-Irv all tests are passing besides the docs; could you (or someone else) help me figure out the problem? The changes I made built locally.

@Dr-Irv (Contributor) left a comment

This is looking good. Some minor changes. I think the CI failure is intermittent when the docs are built; there are other PRs with no doc changes that show the same error.

doc/source/user_guide/io.rst
doc/source/whatsnew/v1.3.5.rst
@Dr-Irv (Contributor) commented Dec 4, 2021

@Dr-Irv all tests are passing besides the docs, could you (or someone else) help me figure out the problem? The changes I made built locally

I created a new issue for the CI failure: #44754

@jreback (Contributor) left a comment

pls don't change the version like you are doing. a number of comments.

doc/source/development/developer.rst
doc/source/user_guide/io.rst
pandas/core/generic.py
pandas/io/json/_json.py
pandas/io/json/_table_schema.py
pandas/tests/extension/decimal/array.py
pandas/tests/io/json/test_pandas.py
@jreback added labels Enhancement, ExtensionArray (Extending pandas with custom dtypes or arrays), and IO JSON (read_json, to_json, json_normalize) on Dec 5, 2021
@jreback changed the title from "BUG: allow JSON (de)serialization of ExtensionDtypes" to "ENH: allow JSON (de)serialization of ExtensionDtypes" on Dec 5, 2021
@Dr-Irv (Contributor) commented Dec 6, 2021

@jmg-duarte in the future, can you not force push your branch, but just push a commit that reflects your latest changes? Thanks

@jmg-duarte (Contributor, Author)

I'm unsure what might have caused test-arm to fail, as it didn't fail before.

@Dr-Irv (Contributor) left a comment

I've added some comments that relate to the feedback from @jreback and should resolve his issues, and there is still an additional test to add at https://github.com/pandas-dev/pandas/pull/44722/files#r763388551

Once you address those, I will review again, and I can approve, but final approval will rest with @jreback

doc/source/user_guide/io.rst
pandas/core/generic.py
@Dr-Irv (Contributor) commented Dec 8, 2021

@jmg-duarte can you also look at https://github.com/pandas-dev/pandas/pull/44722/files#r763388551 . The idea is to explicitly create those extension types in addition to the Decimal and Date ones.

@Dr-Irv (Contributor) left a comment

I'm fine with this. Up to @jreback now

doc/source/user_guide/io.rst

Contributor comment (inline):

so if you are going to all the work of making an extension type like this then you should simply add it to the regular tests and import from there (you can disable any tests that don't work rn)

e.g. put it here: https://github.com/pandas-dev/pandas/tree/master/pandas/tests/extension/decimal

Contributor Author reply (inline):

I'm not sure if I understand. You're suggesting that I move the extension to the decimal folder or create a date folder without a test_date.py?
Should the whole test_json_table_schema_ext_dtype.py be moved too?

Contributor reply (inline):

No, I think you could put all the extension type code in pandas/tests/extension/date but leave the stuff that tests JSON reading where it is. You don't need to add a test_date.py - that can be a separate PR.

pandas/tests/io/json/test_pandas.py
doc/source/user_guide/io.rst
pandas/io/json/_table_schema.py
@jreback added this to the 1.4 milestone on Dec 15, 2021
@jreback (Contributor) commented Dec 15, 2021

@jmg-duarte can you revert the 1.3.5 notes and fix conflicts on other, ping on green.

@jreback merged commit f2639df into pandas-dev:master on Dec 19, 2021
@jreback (Contributor) commented Dec 19, 2021

thanks @jmg-duarte very nice.

If you can follow up with a test that deserializes a 0.20.0-format table schema, that would be great (it almost certainly works, but it would be good to have a specific test for it).

@jmg-duarte (Contributor, Author)

thanks @jmg-duarte very nice.

If you can follow up with a test that deserializes a 0.20.0-format table schema, that would be great (it almost certainly works, but it would be good to have a specific test for it).

I didn't touch any of the previous schema tests, so they still pass.
I can't explicitly test for 0.20.0 in the version field because we upgraded it to 1.4.0.

Am I missing something?

@jreback (Contributor) commented Dec 21, 2021

@jmg-duarte that's the point: we need an explicit example that provides backward compatibility

make one in an older version of pandas and then save it in a file

@jmg-duarte (Contributor, Author) commented Dec 21, 2021 via email

Labels
Enhancement, ExtensionArray (Extending pandas with custom dtypes or arrays), IO JSON (read_json, to_json, json_normalize)
Projects
None yet
Development

Successfully merging this pull request may close these issues.

BUG: Python's date and time objects do not roundtrip with JSON orient='table'
3 participants