diff --git a/dev/pyproject.toml b/dev/pyproject.toml index 8d556ba2ca911..019b6a033983e 100644 --- a/dev/pyproject.toml +++ b/dev/pyproject.toml @@ -31,4 +31,4 @@ required-version = "22.6.0" line-length = 100 target-version = ['py37'] include = '\.pyi?$' -extend-exclude = 'cloudpickle' +extend-exclude = 'cloudpickle|error_classes.py' diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index f63e720659936..90bb3e1dc88b9 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -756,6 +756,16 @@ def __hash__(self): ], ) +pyspark_errors = Module( + name="pyspark-errors", + dependencies=[], + source_file_regexes=["python/pyspark/errors"], + python_test_goals=[ + # unittests + "pyspark.errors.tests.test_errors", + ], +) + sparkr = Module( name="sparkr", dependencies=[hive, mllib], diff --git a/dev/tox.ini b/dev/tox.ini index 15c93832c2ceb..50ef4b21ab6b8 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -30,6 +30,7 @@ per-file-ignores = # Examples contain some unused variables. examples/src/main/python/sql/datasource.py: F841, # Exclude * imports in test files + python/pyspark/errors/tests/*.py: F403, python/pyspark/ml/tests/*.py: F403, python/pyspark/mllib/tests/*.py: F403, python/pyspark/pandas/tests/*.py: F401 F403, diff --git a/python/docs/source/reference/index.rst b/python/docs/source/reference/index.rst index 2f316924405e4..a74b4a82e0209 100644 --- a/python/docs/source/reference/index.rst +++ b/python/docs/source/reference/index.rst @@ -35,3 +35,4 @@ Pandas API on Spark follows the API specifications of latest pandas release. pyspark.mllib pyspark pyspark.resource + pyspark.errors diff --git a/python/docs/source/reference/pyspark.errors.rst b/python/docs/source/reference/pyspark.errors.rst new file mode 100644 index 0000000000000..d18be18fe82e2 --- /dev/null +++ b/python/docs/source/reference/pyspark.errors.rst @@ -0,0 +1,29 @@ +.. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + + +====== +Errors +====== + +.. currentmodule:: pyspark.errors + +.. autosummary:: + :toctree: api/ + + PySparkException.getErrorClass + PySparkException.getMessageParameters diff --git a/python/mypy.ini b/python/mypy.ini index dd1c1cd4875c9..5f662a4a2375b 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -106,6 +106,9 @@ ignore_errors = True [mypy-pyspark.testing.*] ignore_errors = True +[mypy-pyspark.errors.tests.*] +ignore_errors = True + ; Allow non-strict optional for pyspark.pandas [mypy-pyspark.pandas.*] diff --git a/python/pyspark/errors/__init__.py b/python/pyspark/errors/__init__.py new file mode 100644 index 0000000000000..84260aa0fdaac --- /dev/null +++ b/python/pyspark/errors/__init__.py @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +PySpark exceptions. +""" +from pyspark.errors.exceptions import PySparkException + + +__all__ = [ + "PySparkException", +] diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py new file mode 100644 index 0000000000000..8e2a6ed74ca56 --- /dev/null +++ b/python/pyspark/errors/error_classes.py @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json + + +ERROR_CLASSES_JSON = """ +{ + "COLUMN_IN_LIST": { + "message": [ + " does not allow a column in a list." 
class PySparkException(Exception):
    """
    Base Exception for handling errors generated from PySpark.

    An instance is created in exactly one of two ways:

    * from a ready-made ``message``; or
    * from an ``error_class`` plus its ``message_parameters``, in which case the
      message is rendered through ``ErrorClassesReader.get_error_message``.

    Parameters
    ----------
    message : str, optional
        The complete error message. Mutually exclusive with ``error_class`` and
        ``message_parameters``.
    error_class : str, optional
        Name of the error class whose message template should be used. Must be
        given together with ``message_parameters``.
    message_parameters : dict, optional
        Values for the placeholders of the error class' message template. Must be
        given together with ``error_class``.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        error_class: Optional[str] = None,
        message_parameters: Optional[Dict[str, str]] = None,
    ):
        # `message` vs `error_class` & `message_parameters` are mutually exclusive.
        assert (message is not None and (error_class is None and message_parameters is None)) or (
            message is None and (error_class is not None and message_parameters is not None)
        )

        self.error_reader = ErrorClassesReader()

        if message is None:
            self.message = self.error_reader.get_error_message(
                cast(str, error_class), cast(Dict[str, str], message_parameters)
            )
        else:
            self.message = message

        self.error_class = error_class
        self.message_parameters = message_parameters

        # Hand the final message to the base class so that `args` and `repr()`
        # are populated like for any other exception.
        super().__init__(self.message)

    def getErrorClass(self) -> Optional[str]:
        """
        Returns an error class as a string.

        .. versionadded:: 3.4.0

        See Also
        --------
        :meth:`PySparkException.getMessageParameters`
        """
        return self.error_class

    def getMessageParameters(self) -> Optional[Dict[str, str]]:
        """
        Returns the message parameters as a dictionary.

        .. versionadded:: 3.4.0

        See Also
        --------
        :meth:`PySparkException.getErrorClass`
        """
        return self.message_parameters

    def __str__(self) -> str:
        # Only prefix the message with the error class when there is one;
        # a message-only exception must not render as "[None] ...".
        error_class = self.getErrorClass()
        if error_class is None:
            return self.message
        return f"[{error_class}] {self.message}"
+# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/pyspark/errors/tests/test_errors.py b/python/pyspark/errors/tests/test_errors.py new file mode 100644 index 0000000000000..cd2a8a4a22cb4 --- /dev/null +++ b/python/pyspark/errors/tests/test_errors.py @@ -0,0 +1,47 @@ +# -*- encoding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ErrorsTest(unittest.TestCase):
    """Consistency checks for the error classes loaded by ``ErrorClassesReader``."""

    def test_error_classes(self):
        # Test error classes is sorted alphabetically
        error_reader = ErrorClassesReader()
        # `error_info_map` is a dict keyed by error class name, so it cannot be
        # indexed by position; take its keys (in definition order) as a list.
        error_class_names = list(error_reader.error_info_map)
        for current, following in zip(error_class_names, error_class_names[1:]):
            self.assertTrue(
                current < following,
                f"Error class [{current}] should place after [{following}]",
            )
class ErrorClassesReader:
    """
    A reader to load error information from error_classes.py.
    """

    # Matches a `<name>` placeholder in a message template; group 1 is the parameter name.
    _PLACEHOLDER_PATTERN = re.compile(r"<([a-zA-Z0-9_-]+)>")

    def __init__(self) -> None:
        self.error_info_map = ERROR_CLASSES_MAP

    def get_error_message(self, error_class: str, message_parameters: Dict[str, str]) -> str:
        """
        Returns the completed error message by applying message parameters to the message template.

        Parameters
        ----------
        error_class : str
            Either "MAIN_CLASS" or "MAIN_CLASS.SUB_CLASS".
        message_parameters : dict
            Maps every placeholder name used in the template to its value. The set of
            keys must be exactly the set of placeholders in the template.

        Raises
        ------
        AssertionError
            If the given parameter names differ from the placeholders in the template.
        ValueError
            If the main or sub error class cannot be found.
        """
        message_template = self.get_message_template(error_class)
        # Verify message parameters.
        message_parameters_from_template = self._PLACEHOLDER_PATTERN.findall(message_template)
        assert set(message_parameters_from_template) == set(message_parameters), (
            f"Undefined error message parameter for error class: {error_class}. "
            f"Parameters: {message_parameters}"
        )

        # Substitute each `<name>` placeholder directly instead of rewriting the
        # template into `str.format` syntax, so that literal braces in a template
        # are left untouched.
        return self._PLACEHOLDER_PATTERN.sub(
            lambda match: str(message_parameters[match.group(1)]), message_template
        )

    def get_message_template(self, error_class: str) -> str:
        """
        Returns the message template for corresponding error class from error_classes.py.

        For example,
        when given `error_class` is "EXAMPLE_ERROR_CLASS",
        and corresponding error class in error_classes.py looks like the below:

        .. code-block:: python

            "EXAMPLE_ERROR_CLASS" : {
              "message" : [
                "Problem <A> because of <B>."
              ]
            }

        In this case, this function returns:
        "Problem <A> because of <B>."

        For sub error class, when given `error_class` is "EXAMPLE_ERROR_CLASS.SUB_ERROR_CLASS",
        and corresponding error class in error_classes.py looks like the below:

        .. code-block:: python

            "EXAMPLE_ERROR_CLASS" : {
              "message" : [
                "Problem <A> because of <B>."
              ],
              "subClass" : {
                "SUB_ERROR_CLASS" : {
                  "message" : [
                    "Do <C> to fix the problem."
                  ]
                }
              }
            }

        In this case, this function returns:
        "Problem <A> because of <B>. Do <C> to fix the problem."

        Raises
        ------
        ValueError
            If the main error class is unknown, or if the sub error class is not
            defined for the main error class.
        """
        error_classes = error_class.split(".")
        len_error_classes = len(error_classes)
        assert len_error_classes in (1, 2)

        # Generate message template for main error class.
        main_error_class = error_classes[0]
        if main_error_class not in self.error_info_map:
            raise ValueError(f"Cannot find main error class '{main_error_class}'")
        main_error_class_info_map = self.error_info_map[main_error_class]

        # Multi-line messages are stored as a list of lines.
        main_message_template = "\n".join(main_error_class_info_map["message"])

        if len_error_classes == 1:
            return main_message_template

        # Generate message template for sub error class if exists.
        sub_error_class = error_classes[1]
        # A main error class without any "subClass" entry simply has no sub classes;
        # report that the same way as an unknown sub class rather than as a KeyError.
        main_error_class_subclass_info_map = main_error_class_info_map.get("subClass", {})
        if sub_error_class not in main_error_class_subclass_info_map:
            raise ValueError(f"Cannot find sub error class '{sub_error_class}'")
        sub_error_class_info_map = main_error_class_subclass_info_map[sub_error_class]

        sub_message_template = "\n".join(sub_error_class_info_map["message"])
        return main_message_template + " " + sub_message_template
def check_error(
    self,
    exception: PySparkException,
    error_class: str,
    message_parameters: Optional[Dict[str, str]] = None,
):
    """
    Assert that ``exception`` is a ``PySparkException`` carrying the expected
    error class and message parameters.

    Parameters
    ----------
    exception : PySparkException
        The exception instance to inspect.
    error_class : str
        The error class the exception is expected to report.
    message_parameters : dict, optional
        The message parameters the exception is expected to report.
    """
    # The object under test has to be a PySparkException in the first place.
    exception_type_name = exception.__class__.__name__
    self.assertIsInstance(
        exception,
        PySparkException,
        f"checkError requires 'PySparkException', got '{exception_type_name}'.",
    )

    # Compare the reported error class with the expected one.
    observed_error_class = exception.getErrorClass()
    self.assertEqual(
        error_class,
        observed_error_class,
        f"Expected error class was '{error_class}', got '{observed_error_class}'.",
    )

    # Compare the reported message parameters with the expected ones.
    observed_parameters = exception.getMessageParameters()
    self.assertEqual(
        message_parameters,
        observed_parameters,
        f"Expected message parameters was '{message_parameters}', got '{observed_parameters}'",
    )