[SPARK-33457][PYTHON] Adjust mypy configuration
### What changes were proposed in this pull request?

This pull request:

- Adds the following flags to the main mypy configuration:
  - [`strict_optional`](https://mypy.readthedocs.io/en/stable/config_file.html#confval-strict_optional)
  - [`no_implicit_optional`](https://mypy.readthedocs.io/en/stable/config_file.html#confval-no_implicit_optional)
  - [`disallow_untyped_defs`](https://mypy.readthedocs.io/en/stable/config_file.html#confval-disallow_untyped_defs)

These flags are enabled only for the public API and disabled for tests and internal modules.
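
For illustration, a minimal sketch (not from this PR) of the kind of code each flag rejects:

```python
from typing import Optional

# no_implicit_optional: a None default no longer implies Optional;
# parameters taking None must be annotated explicitly.
def implicit(name: str = None) -> str: ...  # error under no_implicit_optional
def explicit(name: Optional[str] = None) -> str: ...  # accepted

# strict_optional: Optional values must be narrowed before use.
def shout(s: Optional[str]) -> str:
    return s.upper()  # error: "None" has no attribute "upper"

def shout_ok(s: Optional[str]) -> str:
    return s.upper() if s is not None else ""  # accepted

# disallow_untyped_defs: unannotated definitions are flagged.
def untyped(x):  # error: function is missing a type annotation
    return x
```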

Additionally, this PR adds missing annotations.

### Why are the changes needed?

The primary reason for proposing these changes is to use the standard configuration used by the typeshed project. This will allow us to be stricter, especially when interacting with JVM code. See for example #29122 (review)

Additionally, it will allow us to detect cases where annotations have been unintentionally omitted.

### Does this PR introduce _any_ user-facing change?

Annotations only.

### How was this patch tested?

`dev/lint-python`.

Closes #30382 from zero323/SPARK-33457.

Authored-by: zero323 <mszymkiewicz@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
zero323 authored and HyukjinKwon committed Nov 25, 2020
1 parent 95b6dab commit 665817b
Showing 28 changed files with 277 additions and 114 deletions.
87 changes: 87 additions & 0 deletions python/mypy.ini
@@ -16,10 +16,97 @@
;

[mypy]
strict_optional = True
no_implicit_optional = True
disallow_untyped_defs = True

; Allow untyped def in internal modules and tests

[mypy-pyspark.daemon]
disallow_untyped_defs = False

[mypy-pyspark.find_spark_home]
disallow_untyped_defs = False

[mypy-pyspark._globals]
disallow_untyped_defs = False

[mypy-pyspark.install]
disallow_untyped_defs = False

[mypy-pyspark.java_gateway]
disallow_untyped_defs = False

[mypy-pyspark.join]
disallow_untyped_defs = False

[mypy-pyspark.ml.tests.*]
disallow_untyped_defs = False

[mypy-pyspark.mllib.tests.*]
disallow_untyped_defs = False

[mypy-pyspark.rddsampler]
disallow_untyped_defs = False

[mypy-pyspark.resource.tests.*]
disallow_untyped_defs = False

[mypy-pyspark.serializers]
disallow_untyped_defs = False

[mypy-pyspark.shuffle]
disallow_untyped_defs = False

[mypy-pyspark.streaming.tests.*]
disallow_untyped_defs = False

[mypy-pyspark.streaming.util]
disallow_untyped_defs = False

[mypy-pyspark.sql.tests.*]
disallow_untyped_defs = False

[mypy-pyspark.sql.pandas.serializers]
disallow_untyped_defs = False

[mypy-pyspark.sql.pandas.types]
disallow_untyped_defs = False

[mypy-pyspark.sql.pandas.typehints]
disallow_untyped_defs = False

[mypy-pyspark.sql.pandas.utils]
disallow_untyped_defs = False

[mypy-pyspark.sql.pandas._typing.protocols.*]
disallow_untyped_defs = False

[mypy-pyspark.sql.utils]
disallow_untyped_defs = False

[mypy-pyspark.tests.*]
disallow_untyped_defs = False

[mypy-pyspark.testing.*]
disallow_untyped_defs = False

[mypy-pyspark.traceback_utils]
disallow_untyped_defs = False

[mypy-pyspark.util]
disallow_untyped_defs = False

[mypy-pyspark.worker]
disallow_untyped_defs = False

; Ignore errors in embedded third party code

[mypy-pyspark.cloudpickle.*]
ignore_errors = True

; Ignore missing imports for external untyped packages

[mypy-py4j.*]
ignore_missing_imports = True
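
The per-module sections above relax the global flags by module-name pattern. For illustration, a hedged sketch (file paths hypothetical) of the effect:

```python
# In a module matched by an override, e.g. pyspark/tests/test_foo.py
# ([mypy-pyspark.tests.*] sets disallow_untyped_defs = False),
# an unannotated definition is accepted:
def helper(x):
    return x

# In a public module with no override, e.g. pyspark/rdd.py, the global
# disallow_untyped_defs = True flags the same style of definition:
# error: Function is missing a type annotation
def transform(x):
    return x
```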

10 changes: 5 additions & 5 deletions python/pyspark/broadcast.pyi
@@ -17,7 +17,7 @@
 # under the License.

 import threading
-from typing import Any, Dict, Generic, Optional, TypeVar
+from typing import Any, Callable, Dict, Generic, Optional, Tuple, TypeVar

 T = TypeVar("T")

@@ -32,14 +32,14 @@ class Broadcast(Generic[T]):
         path: Optional[Any] = ...,
         sock_file: Optional[Any] = ...,
     ) -> None: ...
-    def dump(self, value: Any, f: Any) -> None: ...
-    def load_from_path(self, path: Any): ...
-    def load(self, file: Any): ...
+    def dump(self, value: T, f: Any) -> None: ...
+    def load_from_path(self, path: Any) -> T: ...
+    def load(self, file: Any) -> T: ...
     @property
     def value(self) -> T: ...
     def unpersist(self, blocking: bool = ...) -> None: ...
     def destroy(self, blocking: bool = ...) -> None: ...
-    def __reduce__(self): ...
+    def __reduce__(self) -> Tuple[Callable[[int], T], Tuple[int]]: ...

 class BroadcastPickleRegistry(threading.local):
     def __init__(self) -> None: ...
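
The new `__reduce__` return type mirrors pickle's `(callable, args)` protocol. A minimal sketch (toy class, not the PySpark implementation) of how such a pair drives unpickling:

```python
import pickle
from typing import Callable, Tuple

def _rebuild(payload: int) -> "Box":
    # Called by pickle at load time with the saved args.
    return Box(payload)

class Box:
    def __init__(self, payload: int) -> None:
        self.payload = payload

    def __reduce__(self) -> Tuple[Callable[[int], "Box"], Tuple[int]]:
        # pickle stores the callable and its argument tuple.
        return (_rebuild, (self.payload,))

restored = pickle.loads(pickle.dumps(Box(42)))
assert restored.payload == 42
```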
25 changes: 21 additions & 4 deletions python/pyspark/context.pyi
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.

-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    NoReturn,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+)
+from types import TracebackType

 from py4j.java_gateway import JavaGateway, JavaObject # type: ignore[import]

@@ -51,9 +63,14 @@ class SparkContext:
         jsc: Optional[JavaObject] = ...,
         profiler_cls: type = ...,
     ) -> None: ...
-    def __getnewargs__(self): ...
-    def __enter__(self): ...
-    def __exit__(self, type, value, trace): ...
+    def __getnewargs__(self) -> NoReturn: ...
+    def __enter__(self) -> SparkContext: ...
+    def __exit__(
+        self,
+        type: Optional[Type[BaseException]],
+        value: Optional[BaseException],
+        trace: Optional[TracebackType],
+    ) -> None: ...
     @classmethod
     def getOrCreate(cls, conf: Optional[SparkConf] = ...) -> SparkContext: ...
     def setLogLevel(self, logLevel: str) -> None: ...
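
For reference, a minimal sketch (toy class, not SparkContext itself) of the context-manager protocol these `__enter__`/`__exit__` annotations describe:

```python
from types import TracebackType
from typing import Optional, Type

class Resource:
    def __enter__(self) -> "Resource":
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        # A None (falsy) return lets any in-flight exception propagate,
        # matching SparkContext.__exit__'s -> None annotation.
        return None

with Resource() as r:
    pass  # r is typed as Resource, just as `with SparkContext(...)` yields SparkContext
```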
6 changes: 3 additions & 3 deletions python/pyspark/ml/classification.pyi
@@ -107,7 +107,7 @@ class _JavaProbabilisticClassifier(
 class _JavaProbabilisticClassificationModel(
     ProbabilisticClassificationModel, _JavaClassificationModel[T]
 ):
-    def predictProbability(self, value: Any): ...
+    def predictProbability(self, value: Vector) -> Vector: ...

 class _ClassificationSummary(JavaWrapper):
     @property
@@ -543,7 +543,7 @@ class RandomForestClassificationModel(
     @property
     def trees(self) -> List[DecisionTreeClassificationModel]: ...
     def summary(self) -> RandomForestClassificationTrainingSummary: ...
-    def evaluate(self, dataset) -> RandomForestClassificationSummary: ...
+    def evaluate(self, dataset: DataFrame) -> RandomForestClassificationSummary: ...

 class RandomForestClassificationSummary(_ClassificationSummary): ...
 class RandomForestClassificationTrainingSummary(
@@ -891,7 +891,7 @@ class FMClassifier(
         solver: str = ...,
         thresholds: Optional[Any] = ...,
         seed: Optional[Any] = ...,
-    ): ...
+    ) -> FMClassifier: ...
     def setFactorSize(self, value: int) -> FMClassifier: ...
     def setFitLinear(self, value: bool) -> FMClassifier: ...
     def setMiniBatchFraction(self, value: float) -> FMClassifier: ...
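
Typing JVM-facing methods like `predictProbability` against `Vector` instead of `Any` is what makes the stricter configuration useful at call sites. A toy sketch (class names illustrative only):

```python
from typing import Any

class Vector: ...

class Loose:
    def predictProbability(self, value: Any) -> Any: ...

class Strict:
    def predictProbability(self, value: Vector) -> Vector: ...

Loose().predictProbability("not a vector")   # accepted silently
Strict().predictProbability("not a vector")  # mypy: expected "Vector", got "str"
```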
10 changes: 8 additions & 2 deletions python/pyspark/ml/common.pyi
@@ -16,5 +16,11 @@
 # specific language governing permissions and limitations
 # under the License.

-def callJavaFunc(sc, func, *args): ...
-def inherit_doc(cls): ...
+from typing import Any, TypeVar
+
+import pyspark.context
+
+C = TypeVar("C", bound=type)
+
+def callJavaFunc(sc: pyspark.context.SparkContext, func: Any, *args: Any) -> Any: ...
+def inherit_doc(cls: C) -> C: ...
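
`C = TypeVar("C", bound=type)` lets a class decorator such as `inherit_doc` declare that it returns exactly the class it was given. A toy sketch (decorator name hypothetical):

```python
from typing import TypeVar

C = TypeVar("C", bound=type)

def passthrough(cls: C) -> C:
    # A real decorator would do work here (e.g. copy docstrings);
    # returning the same TypeVar preserves the decorated class's type.
    return cls

@passthrough
class Model:
    def fit(self) -> None: ...

m = Model()
m.fit()  # mypy still sees m as Model, not a bare `type`
```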
24 changes: 13 additions & 11 deletions python/pyspark/ml/evaluation.pyi
@@ -39,9 +39,12 @@ from pyspark.ml.param.shared import (
     HasWeightCol,
 )
 from pyspark.ml.util import JavaMLReadable, JavaMLWritable
+from pyspark.sql.dataframe import DataFrame

 class Evaluator(Params, metaclass=abc.ABCMeta):
-    def evaluate(self, dataset, params: Optional[ParamMap] = ...) -> float: ...
+    def evaluate(
+        self, dataset: DataFrame, params: Optional[ParamMap] = ...
+    ) -> float: ...
     def isLargerBetter(self) -> bool: ...

 class JavaEvaluator(JavaParams, Evaluator, metaclass=abc.ABCMeta):
@@ -75,16 +78,15 @@ class BinaryClassificationEvaluator(
     def setLabelCol(self, value: str) -> BinaryClassificationEvaluator: ...
     def setRawPredictionCol(self, value: str) -> BinaryClassificationEvaluator: ...
     def setWeightCol(self, value: str) -> BinaryClassificationEvaluator: ...
-
-    def setParams(
-        self,
-        *,
-        rawPredictionCol: str = ...,
-        labelCol: str = ...,
-        metricName: BinaryClassificationEvaluatorMetricType = ...,
-        weightCol: Optional[str] = ...,
-        numBins: int = ...
-    ) -> BinaryClassificationEvaluator: ...
+    def setParams(
+        self,
+        *,
+        rawPredictionCol: str = ...,
+        labelCol: str = ...,
+        metricName: BinaryClassificationEvaluatorMetricType = ...,
+        weightCol: Optional[str] = ...,
+        numBins: int = ...
+    ) -> BinaryClassificationEvaluator: ...

 class RegressionEvaluator(
     JavaEvaluator,
20 changes: 13 additions & 7 deletions python/pyspark/ml/feature.pyi
@@ -100,9 +100,9 @@ class _LSHParams(HasInputCol, HasOutputCol):
     def getNumHashTables(self) -> int: ...

 class _LSH(Generic[JM], JavaEstimator[JM], _LSHParams, JavaMLReadable, JavaMLWritable):
-    def setNumHashTables(self: P, value) -> P: ...
-    def setInputCol(self: P, value) -> P: ...
-    def setOutputCol(self: P, value) -> P: ...
+    def setNumHashTables(self: P, value: int) -> P: ...
+    def setInputCol(self: P, value: str) -> P: ...
+    def setOutputCol(self: P, value: str) -> P: ...

 class _LSHModel(JavaModel, _LSHParams):
     def setInputCol(self: P, value: str) -> P: ...
@@ -1518,7 +1518,7 @@ class ChiSqSelector(
         fpr: float = ...,
         fdr: float = ...,
         fwe: float = ...
-    ): ...
+    ) -> ChiSqSelector: ...
     def setSelectorType(self, value: str) -> ChiSqSelector: ...
     def setNumTopFeatures(self, value: int) -> ChiSqSelector: ...
     def setPercentile(self, value: float) -> ChiSqSelector: ...
@@ -1602,7 +1602,7 @@ class _VarianceThresholdSelectorParams(HasFeaturesCol, HasOutputCol):
     def getVarianceThreshold(self) -> float: ...

 class VarianceThresholdSelector(
-    JavaEstimator, _VarianceThresholdSelectorParams, JavaMLReadable, JavaMLWritable
+    JavaEstimator[VarianceThresholdSelectorModel],
+    _VarianceThresholdSelectorParams,
+    JavaMLReadable[VarianceThresholdSelector],
+    JavaMLWritable,
 ):
     def __init__(
         self,
@@ -1615,13 +1618,16 @@ class VarianceThresholdSelector(
         featuresCol: str = ...,
         outputCol: Optional[str] = ...,
         varianceThreshold: float = ...,
-    ): ...
+    ) -> VarianceThresholdSelector: ...
     def setVarianceThreshold(self, value: float) -> VarianceThresholdSelector: ...
     def setFeaturesCol(self, value: str) -> VarianceThresholdSelector: ...
     def setOutputCol(self, value: str) -> VarianceThresholdSelector: ...

 class VarianceThresholdSelectorModel(
-    JavaModel, _VarianceThresholdSelectorParams, JavaMLReadable, JavaMLWritable
+    JavaModel,
+    _VarianceThresholdSelectorParams,
+    JavaMLReadable[VarianceThresholdSelectorModel],
+    JavaMLWritable,
 ):
     def setFeaturesCol(self, value: str) -> VarianceThresholdSelectorModel: ...
     def setOutputCol(self, value: str) -> VarianceThresholdSelectorModel: ...
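
The `self: P ... -> P` annotations added in `_LSH` keep fluent setters subclass-aware: a setter called on a subclass is typed as returning that subclass. A minimal sketch (toy classes, not the real hierarchy):

```python
from typing import TypeVar

P = TypeVar("P", bound="HasInputCol")

class HasInputCol:
    input_col: str = "input"

    def setInputCol(self: P, value: str) -> P:
        self.input_col = value
        return self  # typed as P, so subclasses keep their own type

class MyLSH(HasInputCol):
    def fit(self) -> str:
        return f"fit on {self.input_col}"

result = MyLSH().setInputCol("features").fit()  # chaining preserves MyLSH
```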