From b4578b2d7f526bb965f3e30c5813c360c29b8164 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn" <uwe.korn@quantco.com>
Date: Fri, 20 Mar 2020 19:01:28 +0100
Subject: [PATCH] =?UTF-8?q?ARROW-8175:=20[Python]=20Setup=20type=20chec?=
 =?UTF-8?q?king=20with=20mypy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ci/scripts/python_test.sh                     |  1 +
 python/pyarrow/__init__.py                    |  3 +-
 python/pyarrow/compat.py                      |  5 +-
 python/pyarrow/filesystem.py                  |  3 +-
 python/pyarrow/pandas_compat.py               |  3 +-
 python/pyarrow/plasma.py                      |  8 +-
 python/pyarrow/tests/strategies.py            |  6 +-
 python/pyarrow/tests/test_cuda.py             | 86 ++++++++++---------
 .../pyarrow/tests/test_cuda_numba_interop.py  |  4 +-
 python/pyarrow/tests/test_cython.py           |  2 +-
 python/pyarrow/tests/test_dataset.py          |  2 +-
 python/pyarrow/tests/test_flight.py           |  2 +-
 python/pyarrow/tests/test_parquet.py          |  2 +-
 python/pyarrow/tests/test_serialization.py    |  2 +-
 python/requirements-test.txt                  |  1 +
 python/setup.cfg                              |  6 ++
 16 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh
index 6f961d2f8e0f9..b73cea7aa2386 100755
--- a/ci/scripts/python_test.sh
+++ b/ci/scripts/python_test.sh
@@ -30,3 +30,4 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
 export PYTHONDEVMODE=1
 
 pytest -r s --pyargs pyarrow
+mypy pyarrow
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 6f76508ed7598..e968dda2a85bc 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -17,6 +17,7 @@
 
 # flake8: noqa
 
+from typing import Optional
 
 import os as _os
 import sys as _sys
@@ -40,7 +41,7 @@ def parse_git(root, **kwargs):
         __version__ = setuptools_scm.get_version('../',
                                                  parse=parse_git)
     except ImportError:
-        __version__ = None
+        __version__ = None  # type: ignore
 
 
 import pyarrow.compat as compat
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index 0890ade0577ea..970fcace40e3d 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -26,9 +26,10 @@
 try:
     import pickle5 as builtin_pickle
 except ImportError:
-    import pickle as builtin_pickle
+    import pickle as builtin_pickle  # type: ignore
 
 from collections.abc import Iterable, Mapping, Sequence
+from typing import List
 
 def guid():
     from uuid import uuid4
@@ -122,4 +123,4 @@ def descr_to_dtype(descr):
         return np.dtype({'names': names, 'formats': formats, 'titles': titles,
                             'offsets': offsets, 'itemsize': offset})
 
-__all__ = []
+__all__: List[str] = []
diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py
index c46f1168e3342..95643efe979b2 100644
--- a/python/pyarrow/filesystem.py
+++ b/python/pyarrow/filesystem.py
@@ -20,6 +20,7 @@
 import inspect
 import posixpath
 import urllib.parse
+from typing import Optional
 
 from os.path import join as pjoin
 
@@ -194,7 +195,7 @@ def pathsep(self):
 
 class LocalFileSystem(FileSystem):
 
-    _instance = None
+    _instance: Optional["LocalFileSystem"] = None
 
     @classmethod
     def get_instance(cls):
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 749a62357f390..2a92097d59fe2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -18,6 +18,7 @@
 
 import ast
 from itertools import zip_longest
+from typing import Any, Dict
 import json
 import operator
 import re
@@ -32,7 +33,7 @@
                             frombytes, Sequence)
 
 
-_logical_type_map = {}
+_logical_type_map: Dict[Any, str] = {}
 
 
 def get_logical_type_map():
diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py
index 5bfa0b47dddab..d99a26b92a4c1 100644
--- a/python/pyarrow/plasma.py
+++ b/python/pyarrow/plasma.py
@@ -37,10 +37,14 @@
 # the function build_plasma_tensorflow_op can be used to compile it.
 
 
-TF_PLASMA_OP_PATH = os.path.join(pa.__path__[0], "tensorflow", "plasma_op.so")
+TF_PLASMA_OP_PATH = os.path.join(
+    pa.__path__[0],  # type: ignore
+    "tensorflow",
+    "plasma_op.so",
+)
 
 
-tf_plasma_op = None
+tf_plasma_op = None  # type: ignore
 
 
 def load_plasma_tensorflow_op():
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 2d5808056d306..784bc7ca5d5f4 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from typing import Any
+
 import pytz
 import hypothesis as h
 import hypothesis.strategies as st
@@ -60,7 +62,7 @@
     pa.float32(),
     pa.float64()
 ])
-decimal_type = st.builds(
+decimal_type: Any = st.builds(
     pa.decimal128,
     precision=st.integers(min_value=1, max_value=38),
     scale=st.integers(min_value=1, max_value=38)
@@ -77,7 +79,7 @@
     pa.time64('us'),
     pa.time64('ns')
 ])
-timestamp_types = st.builds(
+timestamp_types: Any = st.builds(
     pa.timestamp,
     unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
     tz=tzst.timezones()
diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py
index 0e4d3c49893c3..1aaa03194cf35 100644
--- a/python/pyarrow/tests/test_cuda.py
+++ b/python/pyarrow/tests/test_cuda.py
@@ -39,20 +39,18 @@
     not has_ipc_support,
     reason='CUDA IPC not supported in platform `%s`' % (platform))
 
-global_context = None  # for flake8
-global_context1 = None  # for flake8
 
+@pytest.fixture(scope="module")
+def global_context():
+    return cuda.Context(0)
 
-def setup_module(module):
-    module.global_context = cuda.Context(0)
-    module.global_context1 = cuda.Context(cuda.Context.get_num_devices() - 1)
 
+@pytest.fixture(scope="module")
+def global_context1():
+    return cuda.Context(cuda.Context.get_num_devices() - 1)
 
-def teardown_module(module):
-    del module.global_context
 
-
-def test_Context():
+def test_Context(global_context, global_context1):
     assert cuda.Context.get_num_devices() > 0
     assert global_context.device_number == 0
     assert global_context1.device_number == cuda.Context.get_num_devices() - 1
@@ -74,7 +72,7 @@ def test_manage_allocate_free_host(size):
     assert buf.size == size
 
 
-def test_context_allocate_del():
+def test_context_allocate_del(global_context):
     bytes_allocated = global_context.bytes_allocated
     cudabuf = global_context.new_buffer(128)
     assert global_context.bytes_allocated == bytes_allocated + 128
@@ -82,7 +80,7 @@ def test_context_allocate_del():
     assert global_context.bytes_allocated == bytes_allocated
 
 
-def make_random_buffer(size, target='host'):
+def make_random_buffer(size, target='host', context=None):
     """Return a host or device buffer with random data.
     """
     if target == 'host':
@@ -97,8 +95,8 @@ def make_random_buffer(size, target='host'):
         np.testing.assert_equal(arr, arr_)
         return arr, buf
     elif target == 'device':
-        arr, buf = make_random_buffer(size, target='host')
-        dbuf = global_context.new_buffer(size)
+        arr, buf = make_random_buffer(size, target='host', context=context)
+        dbuf = context.new_buffer(size)
         assert dbuf.size == size
         dbuf.copy_from_host(buf, position=0, nbytes=size)
         return arr, dbuf
@@ -106,9 +104,9 @@ def make_random_buffer(size, target='host'):
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_context_device_buffer(size):
+def test_context_device_buffer(size, global_context):
     # Creating device buffer from host buffer;
-    arr, buf = make_random_buffer(size)
+    arr, buf = make_random_buffer(size, context=global_context)
     cudabuf = global_context.buffer_from_data(buf)
     assert cudabuf.size == size
     arr2 = np.frombuffer(cudabuf.copy_to_host(), dtype=np.uint8)
@@ -234,9 +232,9 @@ def test_context_device_buffer(size):
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_context_from_object(size):
+def test_context_from_object(size, global_context):
     ctx = global_context
-    arr, cbuf = make_random_buffer(size, target='device')
+    arr, cbuf = make_random_buffer(size, target='device', context=ctx)
     dtype = arr.dtype
 
     # Creating device buffer from a CUDA host buffer
@@ -265,7 +263,7 @@ def test_context_from_object(size):
         ctx.buffer_from_object(np.array([1, 2, 3]))
 
 
-def test_foreign_buffer():
+def test_foreign_buffer(global_context):
     ctx = global_context
     dtype = np.dtype(np.uint8)
     size = 10
@@ -293,8 +291,8 @@ def test_foreign_buffer():
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_CudaBuffer(size):
-    arr, buf = make_random_buffer(size)
+def test_CudaBuffer(size, global_context):
+    arr, buf = make_random_buffer(size, context=global_context)
     assert arr.tobytes() == buf.to_pybytes()
     cbuf = global_context.buffer_from_data(buf)
     assert cbuf.size == size
@@ -321,8 +319,8 @@ def test_CudaBuffer(size):
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_HostBuffer(size):
-    arr, buf = make_random_buffer(size)
+def test_HostBuffer(size, global_context):
+    arr, buf = make_random_buffer(size, context=global_context)
     assert arr.tobytes() == buf.to_pybytes()
     hbuf = cuda.new_host_buffer(size)
     np.frombuffer(hbuf, dtype=np.uint8)[:] = arr
@@ -348,7 +346,7 @@ def test_HostBuffer(size):
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_copy_from_to_host(size):
+def test_copy_from_to_host(size, global_context):
 
     # Create a buffer in host containing range(size)
     buf = pa.allocate_buffer(size, resizable=True)  # in host
@@ -374,8 +372,9 @@ def test_copy_from_to_host(size):
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_copy_to_host(size):
-    arr, dbuf = make_random_buffer(size, target='device')
+def test_copy_to_host(size, global_context):
+    arr, dbuf = make_random_buffer(size, target='device',
+                                   context=global_context)
 
     buf = dbuf.copy_to_host()
     assert buf.is_cpu
@@ -439,8 +438,9 @@ def test_copy_to_host(size):
 
 @pytest.mark.parametrize("dest_ctx", ['same', 'another'])
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_copy_from_device(dest_ctx, size):
-    arr, buf = make_random_buffer(size=size, target='device')
+def test_copy_from_device(dest_ctx, size, global_context, global_context1):
+    arr, buf = make_random_buffer(size=size, target='device',
+                                  context=global_context)
     lst = arr.tolist()
     if dest_ctx == 'another':
         dest_ctx = global_context1
@@ -489,8 +489,9 @@ def put(*args, **kwargs):
 
 
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_copy_from_host(size):
-    arr, buf = make_random_buffer(size=size, target='host')
+def test_copy_from_host(size, global_context):
+    arr, buf = make_random_buffer(size=size, target='host',
+                                  context=global_context)
     lst = arr.tolist()
     dbuf = global_context.new_buffer(size)
 
@@ -532,7 +533,7 @@ def put(*args, **kwargs):
             put(position=position, nbytes=nbytes)
 
 
-def test_BufferWriter():
+def test_BufferWriter(global_context):
     def allocate(size):
         cbuf = global_context.new_buffer(size)
         writer = cuda.BufferWriter(cbuf)
@@ -540,7 +541,8 @@ def allocate(size):
 
     def test_writes(total_size, chunksize, buffer_size=0):
         cbuf, writer = allocate(total_size)
-        arr, buf = make_random_buffer(size=total_size, target='host')
+        arr, buf = make_random_buffer(size=total_size, target='host',
+                                      context=global_context)
 
         if buffer_size > 0:
             writer.buffer_size = buffer_size
@@ -583,12 +585,13 @@ def test_writes(total_size, chunksize, buffer_size=0):
     np.testing.assert_equal(arr[75:], np.arange(25, dtype=np.uint8))
 
 
-def test_BufferWriter_edge_cases():
+def test_BufferWriter_edge_cases(global_context):
     # edge cases, see cuda-test.cc for more information:
     size = 1000
     cbuf = global_context.new_buffer(size)
     writer = cuda.BufferWriter(cbuf)
-    arr, buf = make_random_buffer(size=size, target='host')
+    arr, buf = make_random_buffer(size=size, target='host',
+                                  context=global_context)
 
     assert writer.buffer_size == 0
     writer.buffer_size = 100
@@ -619,9 +622,10 @@ def test_BufferWriter_edge_cases():
     np.testing.assert_equal(arr, arr2)
 
 
-def test_BufferReader():
+def test_BufferReader(global_context):
     size = 1000
-    arr, cbuf = make_random_buffer(size=size, target='device')
+    arr, cbuf = make_random_buffer(size=size, target='device',
+                                   context=global_context)
 
     reader = cuda.BufferReader(cbuf)
     reader.seek(950)
@@ -645,8 +649,9 @@ def test_BufferReader():
     np.testing.assert_equal(arr, arr2)
 
 
-def test_BufferReader_zero_size():
-    arr, cbuf = make_random_buffer(size=0, target='device')
+def test_BufferReader_zero_size(global_context):
+    arr, cbuf = make_random_buffer(size=0, target='device',
+                                   context=global_context)
     reader = cuda.BufferReader(cbuf)
     reader.seek(0)
     data = reader.read()
@@ -666,7 +671,7 @@ def make_recordbatch(length):
     return batch
 
 
-def test_batch_serialize():
+def test_batch_serialize(global_context):
     batch = make_recordbatch(10)
     hbuf = batch.serialize()
     cbuf = cuda.serialize_record_batch(batch, global_context)
@@ -695,10 +700,11 @@ def other_process_for_test_IPC(handle_buffer, expected_arr):
 
 @cuda_ipc
 @pytest.mark.parametrize("size", [0, 1, 1000])
-def test_IPC(size):
+def test_IPC(size, global_context):
     import multiprocessing
     ctx = multiprocessing.get_context('spawn')
-    arr, cbuf = make_random_buffer(size=size, target='device')
+    arr, cbuf = make_random_buffer(size=size, target='device',
+                                   context=global_context)
     ipc_handle = cbuf.export_for_ipc()
     handle_buffer = ipc_handle.serialize()
     p = ctx.Process(target=other_process_for_test_IPC,
diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py
index ff1722d278d5e..6faae0c735380 100644
--- a/python/pyarrow/tests/test_cuda_numba_interop.py
+++ b/python/pyarrow/tests/test_cuda_numba_interop.py
@@ -26,7 +26,9 @@
 from numba.cuda.cudadrv.devicearray import DeviceNDArray  # noqa: E402
 
 
-context_choices = None
+# TODO(ARROW-8174): Refactor context_choices in test_cuda_numba_interop
+#                   to be a module level fixture
+context_choices = None  # type: ignore
 context_choice_ids = ['pyarrow.cuda', 'numba.cuda']
 
 
diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py
index 202868d5c71e2..30fd806ce7801 100644
--- a/python/pyarrow/tests/test_cython.py
+++ b/python/pyarrow/tests/test_cython.py
@@ -116,7 +116,7 @@ def test_cython_api(tmpdir):
             arr = mod.make_null_array(5)
             assert mod.get_array_length(arr) == 5
             assert arr.null_count == 5
-        """.format(mod_path=str(tmpdir), mod_name='pyarrow_cython_example')
+        """.format(mod_name='pyarrow_cython_example')
 
         if sys.platform == 'win32':
             delim, var = ';', 'PATH'
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 72fa280e67158..7fbe8bdc46373 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -34,7 +34,7 @@
 try:
     import pyarrow.dataset as ds
 except ImportError:
-    ds = None
+    ds = None  # type: ignore
 
 # Marks all of the tests in this module
 # Ignore these with pytest ... -m 'not dataset'
diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py
index 322702a2f6c38..6351a339e3774 100644
--- a/python/pyarrow/tests/test_flight.py
+++ b/python/pyarrow/tests/test_flight.py
@@ -38,7 +38,7 @@
         ClientMiddleware, ClientMiddlewareFactory,
     )
 except ImportError:
-    flight = None
+    flight = None  # type: ignore
     FlightClient, FlightServerBase = object, object
     ServerAuthHandler, ClientAuthHandler = object, object
     ServerMiddleware, ServerMiddlewareFactory = object, object
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 6936e34645165..132a460200bda 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -35,7 +35,7 @@
 try:
     import pyarrow.parquet as pq
 except ImportError:
-    pq = None
+    pq = None  # type: ignore
 
 
 try:
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index e54cd88a3f6e9..cea7fd3a0cde1 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -37,7 +37,7 @@
     torch = None
     # Blacklist the module in case `import torch` is costly before
     # failing (ARROW-2071)
-    sys.modules['torch'] = None
+    sys.modules['torch'] = None  # type: ignore
 
 try:
     from scipy.sparse import coo_matrix, csr_matrix, csc_matrix
diff --git a/python/requirements-test.txt b/python/requirements-test.txt
index b019eac002067..08f8796420801 100644
--- a/python/requirements-test.txt
+++ b/python/requirements-test.txt
@@ -2,6 +2,7 @@ cffi
 cython
 hypothesis==5.0; python_version <= "3.5.2"
 hypothesis; python_version > "3.5.2"
+mypy
 pandas==0.24; python_version <= "3.5.2"
 pandas; python_version > "3.5.2"
 pickle5; python_version == "3.6" or python_version == "3.7"
diff --git a/python/setup.cfg b/python/setup.cfg
index 162a507c9c668..8ed3fded6517b 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -28,3 +28,9 @@ build-dir  = doc/_build
 addopts = --ignore=scripts
 filterwarnings =
     error:The SparseDataFrame:FutureWarning
+
+[mypy]
+ignore_missing_imports=True
+no_implicit_optional=True
+check_untyped_defs=False
+strict_equality=True