Skip to content

Commit

Permalink
[SPARK-45164][PS] Remove deprecated Index APIs
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR proposes to remove deprecated `Index` APIs from Pandas API on Spark.

### Why are the changes needed?

To follow the behavior of the latest Pandas.

See pandas-dev/pandas#37877 for `Index.asi8`

See pandas-dev/pandas#42113 for `Index.is_type_compatible`

### Does this PR introduce _any_ user-facing change?

`Index.asi8` and `Index.is_type_compatible` are removed. `Index.astype` and `Index.isin` can be used instead, respectively.

### How was this patch tested?

The existing CI should pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #42926 from itholic/SPARK-45164.

Authored-by: Haejoon Lee <haejoon.lee@databricks.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
  • Loading branch information
itholic authored and dongjoon-hyun committed Sep 16, 2023
1 parent 1b7dbf7 commit 69ba1bd
Show file tree
Hide file tree
Showing 4 changed files with 2 additions and 210 deletions.
2 changes: 2 additions & 0 deletions python/docs/source/migration_guide/pyspark_upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``squeeze`` parameter from ``ps.read_csv`` and ``ps.read_excel`` has been removed from pandas API on Spark.
* In Spark 4.0, ``null_counts`` parameter from ``DataFrame.info`` has been removed from pandas API on Spark, use ``show_counts`` instead.
* In Spark 4.0, the result of ``MultiIndex.append`` does not keep the index names from pandas API on Spark.
* In Spark 4.0, ``Index.asi8`` has been removed from pandas API on Spark, use ``Index.astype`` instead.
* In Spark 4.0, ``Index.is_type_compatible`` has been removed from pandas API on Spark, use ``Index.isin`` instead.
* In Spark 4.0, ``col_space`` parameter from ``DataFrame.to_latex`` and ``Series.to_latex`` has been removed from pandas API on Spark.


Expand Down
63 changes: 0 additions & 63 deletions python/pyspark/pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,44 +624,6 @@ def values(self) -> np.ndarray:
warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
return self.to_numpy()

@property
def asi8(self) -> np.ndarray:
"""
Integer representation of the values.
.. warning:: We recommend using `Index.to_numpy()` instead.
.. note:: This method should only be used if the resulting NumPy ndarray is expected
to be small, as all the data is loaded into the driver's memory.
.. deprecated:: 3.4.0
Returns
-------
numpy.ndarray
An ndarray with int64 dtype.
Examples
--------
>>> ps.Index([1, 2, 3]).asi8
array([1, 2, 3])
Returns None for non-int64 dtype
>>> ps.Index(['a', 'b', 'c']).asi8 is None
True
"""
warnings.warn(
"Index.asi8 is deprecated and will be removed in 4.0.0. " "Use Index.astype instead.",
FutureWarning,
)
if isinstance(self.spark.data_type, IntegralType):
return self.to_numpy()
elif isinstance(self.spark.data_type, (TimestampType, TimestampNTZType)):
return np.array(list(map(lambda x: x.astype(np.int64), self.to_numpy())))
else:
return None

@property
def has_duplicates(self) -> bool:
"""
Expand Down Expand Up @@ -1118,31 +1080,6 @@ def is_object(self) -> bool:
"""
return is_object_dtype(self.dtype)

def is_type_compatible(self, kind: str) -> bool:
"""
Whether the index type is compatible with the provided type.
.. deprecated:: 3.4.0
Examples
--------
>>> psidx = ps.Index([1, 2, 3])
>>> psidx.is_type_compatible('integer')
True
>>> psidx = ps.Index([1.0, 2.0, 3.0])
>>> psidx.is_type_compatible('integer')
False
>>> psidx.is_type_compatible('floating')
True
"""
warnings.warn(
"Index.is_type_compatible is deprecated and will be removed in 4.0.0. "
"Use Index.isin instead.",
FutureWarning,
)
return kind == self.inferred_type

def dropna(self, how: str = "any") -> "Index":
"""
Return Index or MultiIndex without NA/NaN values
Expand Down
8 changes: 0 additions & 8 deletions python/pyspark/pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,14 +1267,6 @@ def inferred_type(self) -> str:
# Always returns "mixed" for MultiIndex
return "mixed"

@property
def asi8(self) -> None:
"""
Integer representation of the values.
"""
# Always returns None for MultiIndex
return None

def factorize(
self, sort: bool = True, na_sentinel: Optional[int] = -1
) -> Tuple["MultiIndex", pd.Index]:
Expand Down
139 changes: 0 additions & 139 deletions python/pyspark/pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1581,145 +1581,6 @@ def test_multiindex_from_frame(self):
psdf = ps.from_pandas(pdf)
self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf))

def test_is_type_compatible(self):
data_types = ["integer", "floating", "string", "boolean"]
# Integer
pidx = pd.Index([1, 2, 3])
psidx = ps.from_pandas(pidx)
# is_type_compatible is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
expected_results = [True, False, False, False]
for data_type, expected_result in zip(data_types, expected_results):
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
else:
for data_type in data_types:
self.assert_eq(
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
)

# Floating
pidx = pd.Index([1.0, 2.0, 3.0])
psidx = ps.from_pandas(pidx)
# is_type_compatible is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
expected_results = [False, True, False, False]
for data_type, expected_result in zip(data_types, expected_results):
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
else:
for data_type in data_types:
self.assert_eq(
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
)

# String
pidx = pd.Index(["a", "b", "c"])
psidx = ps.from_pandas(pidx)
# is_type_compatible is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
expected_results = [False, False, True, False]
for data_type, expected_result in zip(data_types, expected_results):
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
else:
for data_type in data_types:
self.assert_eq(
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
)

# Boolean
pidx = pd.Index([True, False, True, False])
psidx = ps.from_pandas(pidx)
# is_type_compatible is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
expected_results = [False, False, False, True]
for data_type, expected_result in zip(data_types, expected_results):
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
else:
for data_type in data_types:
self.assert_eq(
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
)

# MultiIndex
pmidx = pd.MultiIndex.from_tuples([("a", "x")])
psmidx = ps.from_pandas(pmidx)
# is_type_compatible is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
expected_results = [False, False, False, False]
for data_type, expected_result in zip(data_types, expected_results):
self.assert_eq(psmidx.is_type_compatible(data_type), expected_result)
else:
for data_type in data_types:
self.assert_eq(
pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type)
)

def test_asi8(self):
# Integer
pidx = pd.Index([1, 2, 3])
psidx = ps.from_pandas(pidx)
# asi8 is removed from pandas 2.0.0.
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(np.array(pidx), psidx.asi8)
self.assert_eq(np.array(pidx.astype("int")), psidx.astype("int").asi8)
self.assert_eq(np.array(pidx.astype("int16")), psidx.astype("int16").asi8)
self.assert_eq(np.array(pidx.astype("int8")), psidx.astype("int8").asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)
self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8)
self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8)
self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8)

# Integer with missing value
pidx = pd.Index([1, 2, None, 4, 5])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# Datetime
pidx = pd.date_range(end="1/1/2018", periods=3)
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(
np.array([1514592000000000000, 1514678400000000000, 1514764800000000000]),
psidx.asi8,
)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# Floating
pidx = pd.Index([1.0, 2.0, 3.0])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# String
pidx = pd.Index(["a", "b", "c"])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# Boolean
pidx = pd.Index([True, False, True, False])
psidx = ps.from_pandas(pidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psidx.asi8)
else:
self.assert_eq(pidx.asi8, psidx.asi8)

# MultiIndex
pmidx = pd.MultiIndex.from_tuples([(1, 2)])
psmidx = ps.from_pandas(pmidx)
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
self.assert_eq(None, psmidx.asi8)
else:
self.assert_eq(pmidx.asi8, psmidx.asi8)

def test_index_is_unique(self):
indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
names = [None, "ks", "ks", None]
Expand Down

0 comments on commit 69ba1bd

Please sign in to comment.