[SPARK-32719][PYTHON] Add Flake8 check missing imports
https://issues.apache.org/jira/browse/SPARK-32719

### What changes were proposed in this pull request?

Add a Flake8 check (rule F405) that detects missing imports. This ensures that when a specific class or function is used, it is imported explicitly rather than pulled in through a wildcard import.
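For illustration, this is the kind of pattern the newly enabled F405 rule reports and the style the PR moves to (a minimal sketch, not code from this commit; it assumes `pyspark` is importable):

```python
from pyspark.sql.types import *

# With F405 in dev/tox.ini's `select` list, Flake8 flags each name below as
# "may be undefined, or defined from star imports":
schema = StructType([StructField("name", StringType())])
print(schema)

# The fix applied throughout this PR is an explicit import instead:
# from pyspark.sql.types import StructType, StructField, StringType
```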

### Why are the changes needed?

To make sure that the quality of the Python code is up to standard.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing unit tests and Flake8 static analysis.
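A few lines cannot be converted to explicit imports and are annotated with inline `# noqa` suppressions instead: the Python 2-only `unicode` builtin in the release scripts gets `# noqa: F821`, and the intentional star re-imports in the test files' `__main__` blocks get `# noqa: F401`. A minimal sketch of that suppression pattern (illustrative only, not a line from this commit):

```python
import sys

name = "example"
if sys.version_info[0] == 2:
    # Flake8 analyses this statically and, on Python 3, would report F821
    # (undefined name `unicode`), so the intentional call is suppressed inline.
    name = unicode(name, "UTF-8")  # noqa: F821
print(name)
```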

Closes #29563 from Fokko/fd-add-check-missing-imports.

Authored-by: Fokko Driesprong <fokko@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Fokko authored and HyukjinKwon committed Aug 31, 2020
1 parent 6dacba7 commit a1e459e
Showing 38 changed files with 111 additions and 66 deletions.
4 changes: 3 additions & 1 deletion dev/create-release/generate-contributors.py
@@ -22,7 +22,9 @@
import re
import sys

from releaseutils import *
from releaseutils import tag_exists, raw_input, get_commits, yesOrNoPrompt, get_date, \
    is_valid_author, capitalize_author, JIRA, find_components, translate_issue_type, \
    translate_component, CORE_COMPONENT, contributors_file_name, nice_join

# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
12 changes: 10 additions & 2 deletions dev/create-release/translate-contributors.py
@@ -31,7 +31,15 @@
import os
import sys

from releaseutils import *
from releaseutils import JIRA, JIRAError, get_jira_name, Github, get_github_name, \
    contributors_file_name, is_valid_author, raw_input, capitalize_author, yesOrNoPrompt

try:
    import unidecode
except ImportError:
    print("This tool requires the unidecode library to decode obscure github usernames")
    print("Install using 'sudo pip install unidecode'")
    sys.exit(-1)

# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
@@ -135,7 +143,7 @@ def generate_candidates(author, issues):
    # Note that the candidate name may already be in unicode (JIRA returns this)
    for i, (candidate, source) in enumerate(candidates):
        try:
            candidate = unicode(candidate, "UTF-8")
            candidate = unicode(candidate, "UTF-8")  # noqa: F821
        except TypeError:
            # already in unicode
            pass
2 changes: 1 addition & 1 deletion dev/tox.ini
@@ -19,6 +19,6 @@ max-line-length=100
exclude=python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*

[flake8]
select = E901,E999,F821,F822,F823,F401
select = E901,E999,F821,F822,F823,F401,F405
exclude = python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*
max-line-length = 100
2 changes: 1 addition & 1 deletion examples/src/main/python/sql/basic.py
@@ -30,7 +30,7 @@

# $example on:programmatic_schema$
# Import data types
from pyspark.sql.types import *
from pyspark.sql.types import StringType, StructType, StructField
# $example off:programmatic_schema$


2 changes: 1 addition & 1 deletion python/pyspark/__init__.py
@@ -53,12 +53,12 @@
from pyspark.context import SparkContext
from pyspark.rdd import RDD, RDDBarrier
from pyspark.files import SparkFiles
from pyspark.status import StatusTracker, SparkJobInfo, SparkStageInfo
from pyspark.util import InheritableThread
from pyspark.storagelevel import StorageLevel
from pyspark.accumulators import Accumulator, AccumulatorParam
from pyspark.broadcast import Broadcast
from pyspark.serializers import MarshalSerializer, PickleSerializer
from pyspark.status import *
from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
from pyspark.profiler import Profiler, BasicProfiler
from pyspark.version import __version__ # noqa: F401
3 changes: 2 additions & 1 deletion python/pyspark/ml/base.py
@@ -21,8 +21,9 @@
import threading

from pyspark import since
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasLabelCol, HasFeaturesCol, \
    HasPredictionCol, Params
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType

11 changes: 8 additions & 3 deletions python/pyspark/ml/classification.py
@@ -16,19 +16,24 @@
#

import operator
import sys
import uuid
import warnings
from abc import ABCMeta, abstractmethod, abstractproperty
from multiprocessing.pool import ThreadPool

from pyspark import keyword_only
from pyspark import keyword_only, since, SparkContext
from pyspark.ml import Estimator, Predictor, PredictionModel, Model
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasRawPredictionCol, HasProbabilityCol, HasThresholds, \
    HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, \
    HasAggregationDepth, HasThreshold, HasBlockSize, Param, Params, TypeConverters, \
    HasElasticNetParam, HasSeed, HasStepSize, HasSolver, HasParallelism
from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
    _TreeEnsembleModel, _RandomForestParams, _GBTParams, \
    _HasVarianceImpurity, _TreeClassifierParams
from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel
from pyspark.ml.util import *
from pyspark.ml.base import _PredictorParams
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, HasTrainingSummary
from pyspark.ml.wrapper import JavaParams, \
    JavaPredictor, JavaPredictionModel, JavaWrapper
from pyspark.ml.common import inherit_doc, _java2py, _py2java
7 changes: 5 additions & 2 deletions python/pyspark/ml/clustering.py
@@ -19,9 +19,12 @@
import warnings

from pyspark import since, keyword_only
from pyspark.ml.util import *
from pyspark.ml.param.shared import HasMaxIter, HasFeaturesCol, HasSeed, HasPredictionCol, \
    HasAggregationDepth, HasWeightCol, HasTol, HasProbabilityCol, HasBlockSize, \
    HasDistanceMeasure, HasCheckpointInterval, Param, Params, TypeConverters
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, GeneralJavaMLWritable, \
    HasTrainingSummary, SparkContext
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaWrapper
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc, _java2py
from pyspark.ml.stat import MultivariateGaussian
from pyspark.sql import DataFrame
4 changes: 3 additions & 1 deletion python/pyspark/ml/feature.py
@@ -17,7 +17,9 @@

from pyspark import since, keyword_only, SparkContext
from pyspark.ml.linalg import _convert_to_vector
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasThreshold, HasThresholds, HasInputCol, HasOutputCol, \
    HasInputCols, HasOutputCols, HasHandleInvalid, HasRelativeError, HasFeaturesCol, HasLabelCol, \
    HasSeed, HasNumFeatures, HasStepSize, HasMaxIter, TypeConverters, Param, Params
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams, JavaTransformer, _jvm
from pyspark.ml.common import inherit_doc
8 changes: 5 additions & 3 deletions python/pyspark/ml/fpm.py
@@ -15,11 +15,13 @@
# limitations under the License.
#

from pyspark import keyword_only
import sys

from pyspark import keyword_only, since
from pyspark.sql import DataFrame
from pyspark.ml.util import *
from pyspark.ml.util import JavaMLWritable, JavaMLReadable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasPredictionCol, Param, TypeConverters, Params

__all__ = ["FPGrowth", "FPGrowthModel", "PrefixSpan"]

6 changes: 4 additions & 2 deletions python/pyspark/ml/pipeline.py
@@ -14,11 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os

from pyspark import keyword_only
from pyspark import keyword_only, since, SparkContext
from pyspark.ml.base import Estimator, Model, Transformer
from pyspark.ml.param import Param, Params
from pyspark.ml.util import *
from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader, \
    DefaultParamsReader, DefaultParamsWriter, MLWriter, MLReader, JavaMLWritable
from pyspark.ml.wrapper import JavaParams, JavaWrapper
from pyspark.ml.common import inherit_doc, _java2py, _py2java

6 changes: 4 additions & 2 deletions python/pyspark/ml/recommendation.py
@@ -18,10 +18,12 @@
import sys

from pyspark import since, keyword_only
from pyspark.ml.util import *
from pyspark.ml.param.shared import HasPredictionCol, HasBlockSize, HasMaxIter, HasRegParam, \
    HasCheckpointInterval, HasSeed
from pyspark.ml.wrapper import JavaEstimator, JavaModel
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc
from pyspark.ml.param import Params, TypeConverters, Param
from pyspark.ml.util import JavaMLWritable, JavaMLReadable


__all__ = ['ALS', 'ALSModel']
12 changes: 9 additions & 3 deletions python/pyspark/ml/regression.py
@@ -15,15 +15,21 @@
# limitations under the License.
#

import sys

from abc import ABCMeta

from pyspark import keyword_only
from pyspark import keyword_only, since
from pyspark.ml import Predictor, PredictionModel
from pyspark.ml.base import _PredictorParams
from pyspark.ml.param.shared import *
from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol, \
    Param, Params, TypeConverters, HasMaxIter, HasTol, HasFitIntercept, HasAggregationDepth, \
    HasBlockSize, HasRegParam, HasSolver, HasStepSize, HasSeed, HasElasticNetParam, \
    HasStandardization, HasLoss, HasVarianceCol
from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
    _TreeEnsembleModel, _RandomForestParams, _GBTParams, _TreeRegressorParams
from pyspark.ml.util import *
from pyspark.ml.util import JavaMLWritable, JavaMLReadable, HasTrainingSummary, \
    GeneralJavaMLWritable
from pyspark.ml.wrapper import JavaEstimator, JavaModel, \
    JavaPredictor, JavaPredictionModel, JavaWrapper
from pyspark.ml.common import inherit_doc
6 changes: 4 additions & 2 deletions python/pyspark/ml/tree.py
@@ -15,8 +15,10 @@
# limitations under the License.
#

from pyspark.ml.param.shared import *
from pyspark.ml.util import *
from pyspark import since
from pyspark.ml.param import Params
from pyspark.ml.param.shared import HasCheckpointInterval, HasSeed, HasWeightCol, Param, \
    TypeConverters, HasMaxIter, HasStepSize, HasValidationIndicatorCol
from pyspark.ml.wrapper import JavaPredictionModel
from pyspark.ml.common import inherit_doc

6 changes: 4 additions & 2 deletions python/pyspark/ml/tuning.py
@@ -14,17 +14,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import itertools
from multiprocessing.pool import ThreadPool

import numpy as np

from pyspark import keyword_only
from pyspark import keyword_only, since, SparkContext
from pyspark.ml import Estimator, Model
from pyspark.ml.common import _py2java, _java2py
from pyspark.ml.param import Params, Param, TypeConverters
from pyspark.ml.param.shared import HasCollectSubModels, HasParallelism, HasSeed
from pyspark.ml.util import *
from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader
from pyspark.ml.wrapper import JavaParams
from pyspark.sql.functions import col, lit, rand, UserDefinedFunction
from pyspark.sql.types import BooleanType
2 changes: 1 addition & 1 deletion python/pyspark/mllib/stat/__init__.py
@@ -19,7 +19,7 @@
Python package for statistical functions in MLlib.
"""

from pyspark.mllib.stat._statistics import *
from pyspark.mllib.stat._statistics import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.stat.distribution import MultivariateGaussian
from pyspark.mllib.stat.test import ChiSqTestResult
from pyspark.mllib.stat.KernelDensity import KernelDensity
2 changes: 1 addition & 1 deletion python/pyspark/sql/column.py
@@ -21,7 +21,7 @@

from pyspark import copy_func, since
from pyspark.context import SparkContext
from pyspark.sql.types import *
from pyspark.sql.types import DataType, StructField, StructType, IntegerType, StringType

__all__ = ["Column"]

2 changes: 1 addition & 1 deletion python/pyspark/sql/dataframe.py
@@ -31,7 +31,7 @@
from pyspark.sql.column import Column, _to_seq, _to_list, _to_java_column
from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2
from pyspark.sql.streaming import DataStreamWriter
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.pandas.conversion import PandasConversionMixin
from pyspark.sql.pandas.map_ops import PandasMapOpsMixin

2 changes: 1 addition & 1 deletion python/pyspark/sql/group.py
@@ -21,7 +21,7 @@
from pyspark.sql.column import Column, _to_seq
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.pandas.group_ops import PandasGroupedOpsMixin
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

__all__ = ["GroupedData"]

3 changes: 2 additions & 1 deletion python/pyspark/sql/pandas/conversion.py
@@ -22,7 +22,8 @@
from pyspark.rdd import _load_from_socket
from pyspark.sql.pandas.serializers import ArrowCollectSerializer
from pyspark.sql.types import IntegralType
from pyspark.sql.types import *
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \
    DoubleType, BooleanType, TimestampType, StructType, DataType
from pyspark.traceback_utils import SCCallSiteSync


4 changes: 3 additions & 1 deletion python/pyspark/sql/pandas/types.py
@@ -20,7 +20,9 @@
pandas instances during the type conversion.
"""

from pyspark.sql.types import *
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \
    DoubleType, DecimalType, StringType, BinaryType, DateType, TimestampType, ArrayType, \
    StructType, StructField, BooleanType


def to_arrow_type(dt):
4 changes: 2 additions & 2 deletions python/pyspark/sql/readwriter.py
@@ -14,12 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys

from py4j.java_gateway import JavaClass

from pyspark import RDD, since
from pyspark.sql.column import _to_seq, _to_java_column
from pyspark.sql.types import *
from pyspark.sql.types import StructType
from pyspark.sql import utils
from pyspark.sql.utils import to_str

@@ -1225,7 +1226,6 @@ def overwrite(self, condition):
        Overwrite rows matching the given filter condition with the contents of the data frame in
        the output table.
        """
        condition = _to_java_column(column)
        self._jwriter.overwrite(condition)

@since(3.1)
6 changes: 3 additions & 3 deletions python/pyspark/sql/streaming.py
@@ -23,7 +23,7 @@
from pyspark import since, keyword_only
from pyspark.sql.column import _to_seq
from pyspark.sql.readwriter import OptionUtils, to_str
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.utils import ForeachBatchFunction, StreamingQueryException

__all__ = ["StreamingQuery", "StreamingQueryManager", "DataStreamReader", "DataStreamWriter"]
@@ -1239,8 +1239,8 @@ def _test():
    globs = pyspark.sql.streaming.__dict__.copy()
    try:
        spark = SparkSession.builder.getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)
    except py4j.protocol.Py4JError:  # noqa: F821
        spark = SparkSession(sc)  # noqa: F821

    globs['tempfile'] = tempfile
    globs['os'] = os
6 changes: 4 additions & 2 deletions python/pyspark/sql/tests/test_arrow.py
@@ -25,7 +25,9 @@
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StringType, IntegerType, LongType, \
    FloatType, DoubleType, DecimalType, DateType, TimestampType, BinaryType, StructField, MapType, \
    ArrayType
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
    pandas_requirement_message, pyarrow_requirement_message
from pyspark.testing.utils import QuietTest
@@ -495,7 +497,7 @@ def conf(cls):


if __name__ == "__main__":
    from pyspark.sql.tests.test_arrow import *
    from pyspark.sql.tests.test_arrow import *  # noqa: F401

    try:
        import xmlrunner
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/test_column.py
@@ -17,7 +17,7 @@
#

from pyspark.sql import Column, Row
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql.utils import AnalysisException
from pyspark.testing.sqlutils import ReusedSQLTestCase

2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/test_context.py
@@ -25,7 +25,7 @@

from pyspark import SparkContext, SQLContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StringType, StructField
from pyspark.sql.window import Window
from pyspark.testing.utils import ReusedPySparkTestCase

5 changes: 3 additions & 2 deletions python/pyspark/sql/tests/test_dataframe.py
@@ -23,7 +23,8 @@
import unittest

from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField, \
    BooleanType, DateType, TimestampType, FloatType
from pyspark.sql.utils import AnalysisException, IllegalArgumentException
from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils, have_pyarrow, have_pandas, \
    pandas_requirement_message, pyarrow_requirement_message
@@ -903,7 +904,7 @@ def test_query_execution_listener_on_collect_with_arrow(self):


if __name__ == "__main__":
    from pyspark.sql.tests.test_dataframe import *
    from pyspark.sql.tests.test_dataframe import *  # noqa: F401

    try:
        import xmlrunner