From 29308c5903792bd4be52782d1ce63b7ebd01b6b4 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Fri, 15 Sep 2023 14:14:10 +0800
Subject: [PATCH] init

---
 python/docs/source/getting_started/install.rst  |  2 +-
 python/pyspark/ml/image.py                      | 10 +---------
 .../pandas/tests/computation/test_apply_func.py |  3 +--
 python/pyspark/pandas/tests/test_typedef.py     |  3 +--
 python/pyspark/pandas/typedef/typehints.py      |  3 +--
 python/setup.py                                 | 11 ++++++-----
 6 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index c7584e05f995f..909f9cff3e38d 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -158,7 +158,7 @@ Package                    Supported version Note
 `py4j`                     >=0.10.9.7        Required
 `pandas`                   >=1.4.4           Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
 `pyarrow`                  >=4.0.0           Required for pandas API on Spark and Spark Connect; Optional for Spark SQL
-`numpy`                    >=1.15            Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
+`numpy`                    >=1.21            Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
 `grpcio`                   >=1.48,<1.57      Required for Spark Connect
 `grpcio-status`            >=1.48,<1.57      Required for Spark Connect
 `googleapis-common-protos` ==1.56.4          Required for Spark Connect
diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py
index 6dc97ac246ab3..329a56459e6c8 100644
--- a/python/pyspark/ml/image.py
+++ b/python/pyspark/ml/image.py
@@ -28,7 +28,6 @@
 from typing import Any, Dict, List, NoReturn, Optional, cast
 
 import numpy as np
-from distutils.version import LooseVersion
 
 from pyspark import SparkContext
 from pyspark.sql.types import Row, StructType, _create_row, _parse_datatype_json_string
@@ -225,14 +224,7 @@ def toImage(self, array: np.ndarray, origin: str = "") -> Row:
         else:
             raise ValueError("Invalid number of channels")
 
-        # Running `bytearray(numpy.array([1]))` fails in specific Python versions
-        # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3.
-        # Here, it avoids it by converting it to bytes.
-        if LooseVersion(np.__version__) >= LooseVersion("1.9"):
-            data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
-        else:
-            # Numpy prior to 1.9 don't have `tobytes` method.
-            data = bytearray(array.astype(dtype=np.uint8).ravel())
+        data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
 
         # Creating new Row with _create_row(), because Row(name = value, ... )
         # orders fields by name, which conflicts with expected schema order
diff --git a/python/pyspark/pandas/tests/computation/test_apply_func.py b/python/pyspark/pandas/tests/computation/test_apply_func.py
index 37cc4a4188f64..93d9d56a479af 100644
--- a/python/pyspark/pandas/tests/computation/test_apply_func.py
+++ b/python/pyspark/pandas/tests/computation/test_apply_func.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 from datetime import datetime
-from distutils.version import LooseVersion
 import sys
 import unittest
 from typing import List
@@ -254,7 +253,7 @@ def identify3(x) -> ps.DataFrame[float, [int, List[int]]]:
         self.assert_eq(actual, pdf)
 
         # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+        if sys.version_info >= (3, 8):
             import numpy.typing as ntp
 
             psdf = ps.from_pandas(pdf)
diff --git a/python/pyspark/pandas/tests/test_typedef.py b/python/pyspark/pandas/tests/test_typedef.py
index 97e400d42444e..52913fb65f098 100644
--- a/python/pyspark/pandas/tests/test_typedef.py
+++ b/python/pyspark/pandas/tests/test_typedef.py
@@ -19,7 +19,6 @@
 import unittest
 import datetime
 import decimal
-from distutils.version import LooseVersion
 from typing import List
 
 import pandas
@@ -362,7 +361,7 @@ def test_as_spark_type_pandas_on_spark_dtype(self):
         )
 
         # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-        if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+        if sys.version_info >= (3, 8):
             import numpy.typing as ntp
 
             self.assertEqual(
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 413ef20dae0dd..e66b08b9f0b2c 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -23,7 +23,6 @@
 import sys
 import typing
 from collections.abc import Iterable
-from distutils.version import LooseVersion
 from inspect import isclass
 from typing import Any, Callable, Generic, List, Tuple, Union, Type, get_type_hints
 
@@ -149,7 +148,7 @@ def as_spark_type(
     - Python3's typing system
     """
     # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
-    if sys.version_info >= (3, 8) and LooseVersion(np.__version__) >= LooseVersion("1.21"):
+    if sys.version_info >= (3, 8):
         if (
             hasattr(tpe, "__origin__")
             and tpe.__origin__ is np.ndarray  # type: ignore[union-attr]
diff --git a/python/setup.py b/python/setup.py
index 1cb55abfef075..05658c6a6a8b2 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -131,6 +131,7 @@ def _supports_symlinks():
 # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
 # Also don't forget to update python/docs/source/getting_started/install.rst.
 _minimum_pandas_version = "1.4.4"
+_minimum_numpy_version = "1.21"
 _minimum_pyarrow_version = "4.0.0"
 _minimum_grpc_version = "1.56.0"
 _minimum_googleapis_common_protos_version = "1.56.4"
@@ -307,17 +308,17 @@ def run(self):
         # if you're updating the versions or dependencies.
         install_requires=["py4j==0.10.9.7"],
         extras_require={
-            "ml": ["numpy>=1.15"],
-            "mllib": ["numpy>=1.15"],
+            "ml": ["numpy>=%s" % _minimum_numpy_version],
+            "mllib": ["numpy>=%s" % _minimum_numpy_version],
             "sql": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
             "pandas_on_spark": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
             "connect": [
                 "pandas>=%s" % _minimum_pandas_version,
                 "pyarrow>=%s" % _minimum_pyarrow_version,
                 "grpcio>=%s" % _minimum_grpc_version,
                 "grpcio-status>=%s" % _minimum_grpc_version,
                 "googleapis-common-protos>=%s" % _minimum_googleapis_common_protos_version,
-                "numpy>=1.15",
+                "numpy>=%s" % _minimum_numpy_version,
             ],
         },
         python_requires=">=3.8",
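Reviewer note (not part of the patch): the guards removed above are redundant once NumPy 1.21 is the installed floor, since `ndarray.tobytes()` and `numpy.typing.NDArray` are both available unconditionally there. A minimal sketch, assuming a NumPy >= 1.21 environment; the `column_sums` helper is hypothetical and only illustrates the unguarded `numpy.typing` usage:

    import numpy as np
    import numpy.typing as ntp  # importable without a LooseVersion check on NumPy >= 1.21

    # The old NumPy < 1.9 fallback removed from pyspark/ml/image.py is dead code on any
    # supported version: ndarray.tobytes() is always present.
    array = np.arange(6, dtype=np.float64).reshape(2, 3)
    data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())
    assert data == bytearray(b"\x00\x01\x02\x03\x04\x05")

    # numpy.typing annotations such as ntp.NDArray[np.float64] can likewise be used without
    # guarding on np.__version__, which is why the LooseVersion("1.21") checks were dropped.
    def column_sums(values: ntp.NDArray[np.float64]) -> ntp.NDArray[np.float64]:
        return values.sum(axis=0)

    print(column_sums(array))  # [3. 5. 7.]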