Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bumping pandas version from 1.5.0 to 2.x #373

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Default environment variables
export TEST_SUITE="${TEST_SUITE:=oss}"
export PYTHON_VERSION="${PYTHON_VERSION:=3.9}"
export PANDAS_VERSION=${PANDAS_VERSION-1.5.0}
export PANDAS_VERSION=${PANDAS_VERSION-2.0.3}
export PYTHON_CONNECTION_CLASS="${PYTHON_CONNECTION_CLASS:=Urllib3HttpConnection}"
export CLUSTER="${1:-opensearch}"
export SECURE_INTEGRATION="${2:-true}"
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Replaced usage of `is_datetime_or_timedelta_dtype` with `is_timedelta64_dtype` and `is_datetime64_any_dtype` by @rawwar ([#316](https://github.com/opensearch-project/opensearch-py-ml/pull/316))
- Use try-except-else block for handling unexpected exceptions during integration tests by @rawwar ([#370](https://github.com/opensearch-project/opensearch-py-ml/pull/370))
- Removed pandas version pin in nox tests by @rawwar ([#368](https://github.com/opensearch-project/opensearch-py-ml/pull/368))
- Upgraded pandas version by @yerzhaisang ([#373](https://github.com/opensearch-project/opensearch-py-ml/pull/373))
- Removed usage of deprecated pandas methods such as `mad` and `_construct_axes_from_arguments`, and dropped unused keyword arguments by @yerzhaisang ([#373](https://github.com/opensearch-project/opensearch-py-ml/pull/373))

### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
Expand Down
Empty file added Command
Empty file.
Empty file added Session
Empty file.
6 changes: 2 additions & 4 deletions opensearch_py_ml/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,9 +424,7 @@ def drop(
axis = pd.DataFrame._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
axes, _ = pd.DataFrame()._construct_axes_from_arguments(
(index, columns), {}
)
axes = {"index": index, "columns": columns}
else:
raise ValueError(
"Need to specify at least one of 'labels', 'index' or 'columns'"
Expand Down Expand Up @@ -1355,7 +1353,7 @@ def to_csv(
"compression": compression,
"quoting": quoting,
"quotechar": quotechar,
"line_terminator": line_terminator,
# "line_terminator": line_terminator,
"chunksize": chunksize,
"date_format": date_format,
"doublequote": doublequote,
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# Basic requirements
#
pandas>=1.5.2,<2
pandas>=2.0.3
matplotlib>=3.6.2,<4
numpy>=1.24.0,<2
opensearch-py>=2.2.0
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# Basic requirements
#
pandas>=1.5.2,<2
pandas>=2.0.3
matplotlib>=3.6.2,<4
numpy>=1.24.0,<2
opensearch-py>=2.2.0
Expand Down
17 changes: 17 additions & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import os
from datetime import timedelta

import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal, assert_series_equal

Expand Down Expand Up @@ -151,3 +152,19 @@ def assert_almost_equal(left, right, **kwargs):
assert left is pd.NaT
else:
assert left == right, f"{left} != {right}"


def mad(x):
    """Return the mean absolute deviation of ``x`` around its mean.

    Test helper standing in for the pandas ``mad`` method, so the expected
    values can still be computed on the pandas side.

    Parameters
    ----------
    x : pd.Series or DataFrame-like
        A Series is reduced to a scalar.  Any other input is treated as a
        DataFrame: it is first restricted to its numeric and bool columns
        and then reduced column-wise.

    Returns
    -------
    ``pd.NaT`` for a datetime Series, ``np.nan`` for an object Series,
    otherwise ``mean(|x - mean(x)|)`` (a scalar for a Series, one value
    per remaining column for a DataFrame).
    """
    if isinstance(x, pd.Series):
        # Match every datetime64 flavour (timezone-aware, non-nanosecond
        # units), not only the naive "<M8[ns]" dtype; otherwise such columns
        # would reach np.fabs below as timedeltas and raise.
        if pd.api.types.is_datetime64_any_dtype(x.dtype):
            return pd.Timestamp("NaT")
        if x.dtype == object:
            return np.nan
    else:
        # Non-Series input: keep only the columns a deviation can be
        # computed for.
        numeric_columns = x.select_dtypes(include=["number", "bool"]).columns
        x = x[numeric_columns]
    return np.fabs(x - x.mean()).mean()


def quantile(x, numeric_only=None):
    """Return ``x.quantile()`` using that method's default arguments.

    Parameters
    ----------
    x : object exposing a ``quantile()`` method (e.g. a pandas Series)
    numeric_only : any, optional
        Accepted so that callers may pass it as a keyword, but never
        forwarded to ``x.quantile()``; it has no effect on the result.
    """
    return x.quantile()
2 changes: 1 addition & 1 deletion tests/dataframe/test_describe_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_flights_describe(self):
pd_flights = self.pd_flights()
oml_flights = self.oml_flights()

pd_describe = pd_flights.describe()
pd_describe = pd_flights.drop("timestamp", axis=1).describe()
# We remove bool columns to match pandas output
oml_describe = oml_flights.describe().drop(
["Cancelled", "FlightDelay"], axis="columns"
Expand Down
17 changes: 13 additions & 4 deletions tests/dataframe/test_groupby_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import pytest
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal

from tests.common import TestData
from tests.common import TestData, mad


class TestGroupbyDataFrame(TestData):
Expand Down Expand Up @@ -106,7 +106,16 @@ def test_groupby_aggs_mad_var_std(self, pd_agg, dropna):
pd_flights = self.pd_flights().filter(self.filter_data)
oml_flights = self.oml_flights().filter(self.filter_data)

pd_groupby = getattr(pd_flights.groupby("Cancelled", dropna=dropna), pd_agg)()
if pd_agg == "mad":
pd_groupby = (
pd_flights.groupby("Cancelled", dropna=dropna)
.apply(mad)
.drop("Cancelled", axis=1)
)
else:
pd_groupby = getattr(
pd_flights.groupby("Cancelled", dropna=dropna), pd_agg
)()
oml_groupby = getattr(oml_flights.groupby("Cancelled", dropna=dropna), pd_agg)(
numeric_only=True
)
Expand Down Expand Up @@ -224,14 +233,14 @@ def test_groupby_dataframe_mad(self):
pd_flights = self.pd_flights().filter(self.filter_data + ["DestCountry"])
oml_flights = self.oml_flights().filter(self.filter_data + ["DestCountry"])

pd_mad = pd_flights.groupby("DestCountry").mad()
pd_mad = pd_flights.groupby("DestCountry").apply(mad)
oml_mad = oml_flights.groupby("DestCountry").mad()

assert_index_equal(pd_mad.columns, oml_mad.columns)
assert_index_equal(pd_mad.index, oml_mad.index)
assert_series_equal(pd_mad.dtypes, oml_mad.dtypes)

pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", "mad"])
pd_min_mad = pd_flights.groupby("DestCountry").aggregate(["min", mad])
oml_min_mad = oml_flights.groupby("DestCountry").aggregate(["min", "mad"])

assert_index_equal(pd_min_mad.columns, oml_min_mad.columns)
Expand Down
21 changes: 14 additions & 7 deletions tests/dataframe/test_metrics_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal

from tests.common import TestData, assert_almost_equal
from tests.common import TestData, assert_almost_equal, mad, quantile


class TestDataFrameMetrics(TestData):
Expand Down Expand Up @@ -81,9 +81,10 @@ def test_flights_extended_metrics(self):
logger.setLevel(logging.DEBUG)

for func in self.extended_funcs:
pd_metric = getattr(pd_flights, func)(
**({"numeric_only": True} if func != "mad" else {})
)
if func == "mad":
pd_metric = mad(pd_flights)
else:
pd_metric = getattr(pd_flights, func)(numeric_only=True)
oml_metric = getattr(oml_flights, func)(numeric_only=True)

pd_value = pd_metric["AvgTicketPrice"]
Expand All @@ -101,7 +102,10 @@ def test_flights_extended_metrics_nan(self):
]

for func in self.extended_funcs:
pd_metric = getattr(pd_flights_1, func)()
if func == "mad":
pd_metric = mad(pd_flights_1)
else:
pd_metric = getattr(pd_flights_1, func)()
oml_metric = getattr(oml_flights_1, func)(numeric_only=False)

assert_series_equal(pd_metric, oml_metric, check_exact=False)
Expand All @@ -111,7 +115,10 @@ def test_flights_extended_metrics_nan(self):
oml_flights_0 = oml_flights[oml_flights.FlightNum == "XXX"][["AvgTicketPrice"]]

for func in self.extended_funcs:
pd_metric = getattr(pd_flights_0, func)()
if func == "mad":
pd_metric = mad(pd_flights_0)
else:
pd_metric = getattr(pd_flights_0, func)()
oml_metric = getattr(oml_flights_0, func)(numeric_only=False)

assert_series_equal(pd_metric, oml_metric, check_exact=False)
Expand Down Expand Up @@ -498,7 +505,7 @@ def test_flights_agg_quantile(self, numeric_only):
["AvgTicketPrice", "FlightDelayMin", "dayOfWeek"]
)

pd_quantile = pd_flights.agg(["quantile", "min"], numeric_only=numeric_only)
pd_quantile = pd_flights.agg([quantile, "min"], numeric_only=numeric_only)
oml_quantile = oml_flights.agg(["quantile", "min"], numeric_only=numeric_only)

assert_frame_equal(
Expand Down
Loading