From 29308c5903792bd4be52782d1ce63b7ebd01b6b4 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Fri, 15 Sep 2023 14:14:10 +0800
Subject: [PATCH] init

---
 python/docs/source/getting_started/install.rst  |  2 +-
 python/pyspark/ml/image.py                      | 10 +---------
 .../pandas/tests/computation/test_apply_func.py |  3 +--
 python/pyspark/pandas/tests/test_typedef.py     |  3 +--
 python/pyspark/pandas/typedef/typehints.py      |  3 +--
 python/setup.py                                 | 11 ++++++-----
 6 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index c7584e05f995f..909f9cff3e38d 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -158,7 +158,7 @@ Package                    Supported version Note
 `py4j`                     >=0.10.9.7        Required
 `pandas`                   >=1.4.4           Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
 `pyarrow`                  >=4.0.0           Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
-`numpy`                    >=1.15            Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
+`numpy`                    >=1.21            Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
 `grpcio`                   >=1.48,<1.57      Required for Spark Connect
 `grpcio-status`            >=1.48,<1.57      Required for Spark Connect
 `googleapis-common-protos` ==1.56.4          Required for Spark Connect
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index 6dc97ac246ab3..329a56459e6c8 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -28,7 +28,6 @@
 from typing import Any, Dict, List, NoReturn, Optional, cast
 
 import numpy as np
-from distutils.version import LooseVersion
 
 from pyspark import SparkContext
 from pyspark.sql.types import Row, StructType, _create_row, _parse_datatype_json_string
@@ -225,14 +224,7 @@ def toImage(self, array: np.ndarray, origin: str = "") -> Row:
         else:
             raise ValueError("Invalid number of channels")
 
-        # Running `bytearray(numpy.array([1]))` fails in specific Python versions
-        # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3.
-        # Here, it avoids it by converting it to bytes.
-        if LooseVersion(np.__version__) >= LooseVersion("1.9"):
-            data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
-        else:
-            # Numpy prior to 1.9 don't have `tobytes` method.
-            data = bytearray(array.astype(dtype=np.uint8).ravel())
+        data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
 
         # Creating new Row with _create_row(), because Row(name = value, ... )
         # orders fields by name, which conflicts with expected schema order
diff --git a/python/pyspark/pandas/tests/computation/test_apply_func.py b/python/pyspark/pandas/tests/computation/test_apply_func.py
index 37cc4a4188f64..93d9d56a479af 100644
--- a/python/pyspark/pandas/tests/computation/test_apply_func.py
+++ b/python/pyspark/pandas/tests/computation/test_apply_func.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 from datetime import datetime
-from distutils.version import LooseVersion
 import sys
 import unittest
 from typing import List
@@ -254,7 +253,7 @@ def identify3(x) -> ps.DataFrame[float, [int, List[int]]]:
         self.assert_eq(actual, pdf)
 
         # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+        if sys.version_info >= (3, 8):
             import numpy.typing as ntp
 
             psdf = ps.from_pandas(pdf)
diff --git a/python/pyspark/pandas/tests/test_typedef.py b/python/pyspark/pandas/tests/test_typedef.py
index 97e400d42444e..52913fb65f098 100644
--- a/python/pyspark/pandas/tests/test_typedef.py
+++ b/python/pyspark/pandas/tests/test_typedef.py
@@ -19,7 +19,6 @@
 import unittest
 import datetime
 import decimal
-from distutils.version import LooseVersion
 from typing import List
 
 import pandas
@@ -362,7 +361,7 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
         )
 
         # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+        if sys.version_info >= (3, 8):
             import numpy.typing as ntp
 
             self.assertEqual(
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 413ef20dae0dd..e66b08b9f0b2c 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -23,7 +23,6 @@
 import sys
 import typing
 from collections.abc import Iterable
-from distutils.version import LooseVersion
 from inspect import isclass
 from typing import Any, Callable, Generic, List, Tuple, Union, Type, get_type_hints
 
@@ -149,7 +148,7 @@ def as_spark_type(
     - Python3's typing system
     """
     # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+    if sys.version_info >= (3, 8):
         if (
             hasattr(tpe, "__origin__")
             and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
diff --git a/python/setup.py b/python/setup.py
index 1cb55abfef075..05658c6a6a8b2 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -131,6 +131,7 @@ def _supports_symlinks():
 # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
 # Also don't forget to update python/docs/source/getting_started/install.rst.
 _minimum_pandas_version = "1.4.4"
+_minimum_numpy_version = "1.21"
 _minimum_pyarrow_version = "4.0.0"
 _minimum_grpc_version = "1.56.0"
 _minimum_googleapis_common_protos_version = "1.56.4"
@@ -307,17 +308,17 @@ def run(self):
         # if you're updating the versions or dependencies.
         install_requires=["py4j==0.10.9.7"],
         extras_require={
-            "ml": ["numpy>=1.15"],
-            "mllib": ["numpy>=1.15"],
+            "ml": ["numpy>=%s" % _minimum_numpy_version],
+            "mllib": ["numpy>=%s" % _minimum_numpy_version],
             "sql": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
             "pandas_on_spark": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
             "connect": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
                 "grpcio>=%s" % _minimum_grpc_version,
                 "grpcio-status>=%s" % _minimum_grpc_version,
                 "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
         },
         python_requires=">=3.8",
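Reviewer note (not part of the patch): the guards removed above are redundant once NumPy 1.21 is the installed floor, since `ndarray.tobytes()` and `numpy.typing.NDArray` are both available unconditionally there. A minimal sketch, assuming a NumPy >= 1.21 environment; the `column_sums` helper is hypothetical and only illustrates the unguarded `numpy.typing` usage:

    import numpy as np
    import numpy.typing as ntp  # importable without a LooseVersion check on NumPy >= 1.21

    # The old NumPy < 1.9 fallback removed from pyspark/ml/image.py is dead code on any
    # supported version: ndarray.tobytes() is always present.
    array = np.arange(6, dtype=np.float64).reshape(2, 3)
    data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
    assert data == bytearray(b"\x00\x01\x02\x03\x04\x05")

    # numpy.typing annotations such as ntp.NDArray[np.float64] can likewise be used without
    # guarding on np.__version__, which is why the LooseVersion("1.21") checks were dropped.
    def column_sums(values: ntp.NDArray[np.float64]) -> ntp.NDArray[np.float64]:
        return values.sum(axis=0)

    print(column_sums(array))  # [3. 5. 7.]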