Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix test suite #1314

Merged
merged 14 commits into from
Feb 13, 2024
4 changes: 4 additions & 0 deletions datashader/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,10 @@
dtype = np.dtype(object) if dtype is None else np.dtype(dtype)
return np.asarray(self.tolist(), dtype=dtype)

def duplicated(self, *args, **kwargs):
msg = "duplicated is not implemented for RaggedArray"
raise NotImplementedError(msg)

Check warning on line 654 in datashader/datatypes.py

View check run for this annotation

Codecov / codecov/patch

datashader/datatypes.py#L653-L654

Added lines #L653 - L654 were not covered by tests


@jit(nopython=True, nogil=True)
def _eq_ragged_ragged(start_indices1,
Expand Down
114 changes: 57 additions & 57 deletions datashader/tests/test_dask.py

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions datashader/tests/test_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,12 @@ def test_sort_values_frame(self):
def test_where_series(self):
pass

@pytest.mark.xfail(reason="not currently supported")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we instead pin to pandas !=2.2, expecting them to fix it in the next release?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line you have commented on is a new test for ExtensionArray, which was added in Pandas 2.2.

I have updated the skip for the upstream regression.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused, since it says @pytest.mark.xfail(reason="not currently supported") ("currently"), but in any case, sounds like you're on it!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

duplicated has likely never worked for RaggedArray; I have added a NotImplementedError for it in the class. Before that, this would give a wrong result.
image

import pandas as pd
from datashader.datatypes import RaggedArray

data = RaggedArray([[0, 1], [1, 2, 3, 4], [], [-1, -2], []], dtype='float64')
arr = data.take([0, 1, 0, 1])
arr.duplicated('first')

arr = pd.array([0, 1, 0, 1])
arr.duplicated('first')

Pandas use a hashtable to check if something is duplicated, but it is not designed to work with RaggedArray (which makes sense). I don't think it is impossible to support this, which is why I have added "currently". I hope that clarifies things.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, thanks! Maybe the way to clarify it would be to state what doesn't currently support it, e.g. "not currently supported by Pandas", "not currently supported by RaggedArray", or "not currently supported by Datashader".

def test_duplicated(self, data):
# Added in Pandas 2.2
# https://github.com/pandas-dev/pandas/pull/55255
super().test_duplicated(data)

class TestRaggedPrinting(eb.BasePrintingTests):
@pytest.mark.skip(reason="Can't autoconvert ragged array to numpy array")
def test_dataframe_repr(self):
Expand Down Expand Up @@ -873,6 +879,11 @@ def test_fillna_no_op_returns_copy(self):
def test_fillna_series_method(self):
pass

@pytest.mark.skip(reason="Can't fill with nested sequences")
def test_ffill_limit_area(self):
# Added in Pandas 2.2
pass


class TestRaggedReshaping(eb.BaseReshapingTests):
@pytest.mark.skip(reason="__setitem__ not supported")
Expand All @@ -886,3 +897,15 @@ def test_transpose(self):
@pytest.mark.skip(reason="transpose with numpy array elements seems not supported")
def test_transpose_frame(self):
pass

@pytest.mark.skipif(
Version(pd.__version__) == Version("2.2.0"), reason="Regression in Pandas 2.2"
)
def test_merge_on_extension_array(self, data):
super().test_merge_on_extension_array(data)

@pytest.mark.skipif(
Version(pd.__version__) == Version("2.2.0"), reason="Regression in Pandas 2.2"
)
def test_merge_on_extension_array_duplicates(self, data):
super().test_merge_on_extension_array_duplicates(data)
6 changes: 3 additions & 3 deletions datashader/tests/test_geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def test_lines_spatialpandas(geom_type, explode, use_boundary, npartitions):
def test_points_geopandas(geom_type):
df = geopandas.read_file(geodatasets.get_path("nybb"))

df["geometry"] = df["geometry"].sample_points(100, seed=93814) # multipoint
df["geometry"] = df["geometry"].sample_points(100, rng=93814) # multipoint
if geom_type == "point":
df = df.explode(index_parts=False) # Multipoint -> point.
unique_geom_type = df["geometry"].geom_type.unique()
Expand All @@ -218,7 +218,7 @@ def test_points_geopandas(geom_type):
def test_points_dask_geopandas(geom_type, npartitions):
df = geopandas.read_file(geodatasets.get_path("nybb"))

df["geometry"] = df["geometry"].sample_points(100, seed=93814) # multipoint
df["geometry"] = df["geometry"].sample_points(100, rng=93814) # multipoint
if geom_type == "point":
df = df.explode(index_parts=False) # Multipoint -> point.
unique_geom_type = df["geometry"].geom_type.unique()
Expand All @@ -240,7 +240,7 @@ def test_points_dask_geopandas(geom_type, npartitions):
def test_points_spatialpandas(geom_type, npartitions):
df = geopandas.read_file(geodatasets.get_path("nybb"))

df["geometry"] = df["geometry"].sample_points(100, seed=93814) # multipoint
df["geometry"] = df["geometry"].sample_points(100, rng=93814) # multipoint
if geom_type == "point":
df = df.explode(index_parts=False) # Multipoint -> point.
unique_geom_type = df["geometry"].geom_type.unique()
Expand Down
3 changes: 2 additions & 1 deletion datashader/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2734,7 +2734,8 @@ def test_line_antialias():
numerator = np.nan_to_num(line_antialias_sol_0_intersect) + np.nan_to_num(line_antialias_sol_1)
denom = np.nan_to_num(line_antialias_sol_count_ignore_aa_0) + \
np.nan_to_num(line_antialias_sol_count_ignore_aa_1)
sol = 3*numerator / denom
with np.errstate(invalid='ignore'): # Dividing by zero is expected
sol = 3*numerator / denom
assert_eq_ndarray(agg.data, sol, close=True)

agg = cvs.line(agg=ds._min_row_index(), **kwargs)
Expand Down
2 changes: 1 addition & 1 deletion datashader/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
df = pd.DataFrame({'x': np.array(([0.] * 10 + [1] * 10)),
'y': np.array(([0.] * 5 + [1] * 5 + [0] * 5 + [1] * 5)),
'f64': np.arange(20, dtype='f8')})
df.f64.iloc[2] = np.nan
df.loc[2, 'f64'] = np.nan

cvs = ds.Canvas(plot_width=2, plot_height=2, x_range=(0, 1), y_range=(0, 1))
cvs10 = ds.Canvas(plot_width=10, plot_height=10, x_range=(0, 1), y_range=(0, 1))
Expand Down
13 changes: 13 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,16 @@ target-version = "py39"

[tool.ruff.per-file-ignores]
"test_mpl_ext.py" = ["E402"] # Module level import not at top of file

[tool.pytest.ini_options]
addopts = ["--pyargs", "--doctest-modules", "--doctest-ignore-import-errors", "--strict-config", "--strict-markers", "--color=yes"]
norecursedirs = 'doc .git dist build _build .ipynb_checkpoints'
minversion = "7"
xfail_strict = true
log_cli_level = "INFO"
# skipping any notebooks that require extra deps
nbsmoke_skip_run = ".*tiling.ipynb$\n.*streaming-aggregation.ipynb$\n.*8_Geography.ipynb$"
filterwarnings = [
"ignore:Passing a (SingleBlockManager|BlockManager) to (Series|GeoSeries|DataFrame|GeoDataFrame) is deprecated:DeprecationWarning", # https://github.com/holoviz/spatialpandas/issues/137
"ignore:Accessing the underlying geometries through the `.data`:DeprecationWarning:dask_geopandas.core", # https://github.com/geopandas/dask-geopandas/issues/264
]
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,13 @@
'tests': geopandas + [
'codecov',
'geodatasets',
'fastparquet', # optional dependency
'flake8',
'nbconvert',
'nbformat',
'nbsmoke[verify] >0.5',
'netcdf4',
'pyarrow',
'pytest',
'pytest <8', # Fails lint with IPynbFile is deprecated
'pytest-benchmark',
'pytest-cov',
'rasterio',
Expand Down
8 changes: 0 additions & 8 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,6 @@ deps = unit: {[_unit]deps}
examples_extra: {[_examples_extra]deps}
all: {[_all]deps}

[pytest]
addopts = -v --pyargs --doctest-modules --doctest-ignore-import-errors
norecursedirs = doc .git dist build _build .ipynb_checkpoints
# skipping any notebooks that require extra deps
nbsmoke_skip_run = .*tiling.ipynb$
.*streaming-aggregation.ipynb$
.*8_Geography.ipynb$

[flake8]
include = *.py
# run_tests.py is generated by conda build, which appears to have a
Expand Down
Loading