From 95b6dabc33515f1975eb889480ccca12bf5ac3c8 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Wed, 25 Nov 2020 07:38:45 +0900 Subject: [PATCH 001/150] [SPARK-33287][SS][UI] Expose state custom metrics information on SS UI ### What changes were proposed in this pull request? Structured Streaming UI is not containing state custom metrics information. In this PR I've added it. ### Why are the changes needed? Missing state custom metrics information. ### Does this PR introduce _any_ user-facing change? Additional UI elements appear. ### How was this patch tested? Existing unit tests + manual test. ``` #Compile Spark echo "spark.sql.streaming.ui.enabledCustomMetricList stateOnCurrentVersionSizeBytes" >> conf/spark-defaults.conf sbin/start-master.sh sbin/start-worker.sh spark://gsomogyi-MBP16:7077 ./bin/spark-submit --master spark://gsomogyi-MBP16:7077 --deploy-mode client --class com.spark.Main ../spark-test/target/spark-test-1.0-SNAPSHOT-jar-with-dependencies.jar ``` Screenshot 2020-11-18 at 12 45 36 Closes #30336 from gaborgsomogyi/SPARK-33287. Authored-by: Gabor Somogyi Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../spark/sql/internal/StaticSQLConf.scala | 12 ++ .../ui/StreamingQueryStatisticsPage.scala | 143 +++++++++++++----- .../ui/StreamingQueryPageSuite.scala | 5 + .../sql/streaming/ui/UISeleniumSuite.scala | 6 + 4 files changed, 127 insertions(+), 39 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index ca1074fcf6fc0..02cb6f29622f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -249,4 +249,16 @@ object StaticSQLConf { .version("3.1.0") .timeConf(TimeUnit.SECONDS) .createWithDefault(-1) + + val ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST = + buildStaticConf("spark.sql.streaming.ui.enabledCustomMetricList") + .internal() + .doc("Configures a list of custom metrics on Structured Streaming UI, which are enabled. " + + "The list contains the name of the custom metrics separated by comma. In aggregation" + + "only sum used. 
The list of supported custom metrics is state store provider specific " + + "and it can be found out for example from query progress log entry.") + .version("3.1.0") + .stringConf + .toSequence + .createWithDefault(Nil) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala index f48672afb41f3..77b1e61d587a7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala @@ -19,18 +19,32 @@ package org.apache.spark.sql.streaming.ui import java.{util => ju} import java.lang.{Long => JLong} -import java.util.UUID +import java.util.{Locale, UUID} import javax.servlet.http.HttpServletRequest +import scala.collection.JavaConverters._ import scala.xml.{Node, NodeBuffer, Unparsed} import org.apache.spark.internal.Logging +import org.apache.spark.sql.execution.streaming.state.StateStoreProvider +import org.apache.spark.sql.internal.SQLConf.STATE_STORE_PROVIDER_CLASS +import org.apache.spark.sql.internal.StaticSQLConf.ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST import org.apache.spark.sql.streaming.ui.UIUtils._ import org.apache.spark.ui.{GraphUIData, JsCollector, UIUtils => SparkUIUtils, WebUIPage} private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) extends WebUIPage("statistics") with Logging { + // State store provider implementation mustn't do any heavyweight initialiation in constructor + // but in its init method. + private val supportedCustomMetrics = StateStoreProvider.create( + parent.parent.conf.get(STATE_STORE_PROVIDER_CLASS)).supportedCustomMetrics + logDebug(s"Supported custom metrics: $supportedCustomMetrics") + + private val enabledCustomMetrics = + parent.parent.conf.get(ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST).map(_.toLowerCase(Locale.ROOT)) + logDebug(s"Enabled custom metrics: $enabledCustomMetrics") + def generateLoadResources(request: HttpServletRequest): Seq[Node] = { // scalastyle:off @@ -199,49 +213,100 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) "records") graphUIDataForNumRowsDroppedByWatermark.generateDataJs(jsCollector) - // scalastyle:off - - -
-      Aggregated Number Of Total State Rows {SparkUIUtils.tooltip("Aggregated number of total state rows.", "right")}
-      {graphUIDataForNumberTotalRows.generateTimelineHtml(jsCollector)}
-      {graphUIDataForNumberTotalRows.generateHistogramHtml(jsCollector)}
-      Aggregated Number Of Updated State Rows {SparkUIUtils.tooltip("Aggregated number of updated state rows.", "right")}
-      {graphUIDataForNumberUpdatedRows.generateTimelineHtml(jsCollector)}
-      {graphUIDataForNumberUpdatedRows.generateHistogramHtml(jsCollector)}
-      Aggregated State Memory Used In Bytes {SparkUIUtils.tooltip("Aggregated state memory used in bytes.", "right")}
-      {graphUIDataForMemoryUsedBytes.generateTimelineHtml(jsCollector)}
-      {graphUIDataForMemoryUsedBytes.generateHistogramHtml(jsCollector)}
-      Aggregated Number Of Rows Dropped By Watermark {SparkUIUtils.tooltip("Accumulates all input rows being dropped in stateful operators by watermark. 'Inputs' are relative to operators.", "right")}
-      {graphUIDataForNumRowsDroppedByWatermark.generateTimelineHtml(jsCollector)}
-      {graphUIDataForNumRowsDroppedByWatermark.generateHistogramHtml(jsCollector)}
-      // scalastyle:on
+    val result =
+      // scalastyle:off
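The block added below rebuilds the same four aggregated state graphs and then appends one graph per enabled custom metric. Those per-operator custom metrics are also reported in the streaming query progress, so the effect of `spark.sql.streaming.ui.enabledCustomMetricList` can be cross-checked outside the UI. A minimal PySpark sketch, not part of this patch — the rate-source query and names are illustrative, and `stateOnCurrentVersionSizeBytes` is one of the metrics reported by the default HDFS-backed state store provider:

```python
from pyspark.sql import SparkSession

# The new conf is static, so it must be set before the session (and its UI) starts.
spark = (SparkSession.builder
         .config("spark.sql.streaming.ui.enabledCustomMetricList",
                 "stateOnCurrentVersionSizeBytes")
         .getOrCreate())

# Any stateful query works; a rate-source aggregation keeps the example self-contained.
counts = (spark.readStream.format("rate").option("rowsPerSecond", "10").load()
          .groupBy("value").count())
query = (counts.writeStream.outputMode("complete")
         .format("memory").queryName("state_demo").start())

# The UI page sums this per-operator value across stateOperators for each progress entry.
progress = query.lastProgress
if progress:
    for op in progress["stateOperators"]:
        # Metric names are provider specific; this one comes from the HDFS-backed provider.
        print(op.get("customMetrics", {}).get("stateOnCurrentVersionSizeBytes"))
```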
+      Aggregated Number Of Total State Rows {SparkUIUtils.tooltip("Aggregated number of total state rows.", "right")}
+      {graphUIDataForNumberTotalRows.generateTimelineHtml(jsCollector)}
+      {graphUIDataForNumberTotalRows.generateHistogramHtml(jsCollector)}
+      Aggregated Number Of Updated State Rows {SparkUIUtils.tooltip("Aggregated number of updated state rows.", "right")}
+      {graphUIDataForNumberUpdatedRows.generateTimelineHtml(jsCollector)}
+      {graphUIDataForNumberUpdatedRows.generateHistogramHtml(jsCollector)}
+      Aggregated State Memory Used In Bytes {SparkUIUtils.tooltip("Aggregated state memory used in bytes.", "right")}
+      {graphUIDataForMemoryUsedBytes.generateTimelineHtml(jsCollector)}
+      {graphUIDataForMemoryUsedBytes.generateHistogramHtml(jsCollector)}
+      Aggregated Number Of Rows Dropped By Watermark {SparkUIUtils.tooltip("Accumulates all input rows being dropped in stateful operators by watermark. 'Inputs' are relative to operators.", "right")}
+ + {graphUIDataForNumRowsDroppedByWatermark.generateTimelineHtml(jsCollector)} + {graphUIDataForNumRowsDroppedByWatermark.generateHistogramHtml(jsCollector)} + + // scalastyle:on + + if (enabledCustomMetrics.nonEmpty) { + result ++= generateAggregatedCustomMetrics(query, minBatchTime, maxBatchTime, jsCollector) + } + result } else { new NodeBuffer() } } + def generateAggregatedCustomMetrics( + query: StreamingQueryUIData, + minBatchTime: Long, + maxBatchTime: Long, + jsCollector: JsCollector): NodeBuffer = { + val result: NodeBuffer = new NodeBuffer + + // This is made sure on caller side but put it here to be defensive + require(query.lastProgress.stateOperators.nonEmpty) + query.lastProgress.stateOperators.head.customMetrics.keySet().asScala + .filter(m => enabledCustomMetrics.contains(m.toLowerCase(Locale.ROOT))).map { metricName => + val data = query.recentProgress.map(p => (parseProgressTimestamp(p.timestamp), + p.stateOperators.map(_.customMetrics.get(metricName).toDouble).sum)) + val max = data.maxBy(_._2)._2 + val metric = supportedCustomMetrics.find(_.name.equalsIgnoreCase(metricName)).get + + val graphUIData = + new GraphUIData( + s"aggregated-$metricName-timeline", + s"aggregated-$metricName-histogram", + data, + minBatchTime, + maxBatchTime, + 0, + max, + "") + graphUIData.generateDataJs(jsCollector) + + result ++= + // scalastyle:off + + +
+          Aggregated Custom Metric {s"$metricName"} {SparkUIUtils.tooltip(metric.desc, "right")}
+ + {graphUIData.generateTimelineHtml(jsCollector)} + {graphUIData.generateHistogramHtml(jsCollector)} + + // scalastyle:on + } + + result + } + def generateStatTable(query: StreamingQueryUIData): Seq[Node] = { val batchToTimestamps = withNoProgress(query, query.recentProgress.map(p => (p.batchId, parseProgressTimestamp(p.timestamp))), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala index 640c21c52a146..c2b6688faf0e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala @@ -24,8 +24,10 @@ import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} import org.scalatest.BeforeAndAfter import scala.xml.Node +import org.apache.spark.SparkConf import org.apache.spark.sql.streaming.StreamingQueryProgress import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.ui.SparkUI class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { @@ -65,10 +67,13 @@ class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { val request = mock(classOf[HttpServletRequest]) val tab = mock(classOf[StreamingQueryTab], RETURNS_SMART_NULLS) val statusListener = mock(classOf[StreamingQueryStatusListener], RETURNS_SMART_NULLS) + val ui = mock(classOf[SparkUI]) when(request.getParameter("id")).thenReturn(id.toString) when(tab.appName).thenReturn("testing") when(tab.headerTabs).thenReturn(Seq.empty) when(tab.statusListener).thenReturn(statusListener) + when(ui.conf).thenReturn(new SparkConf()) + when(tab.parent).thenReturn(ui) val streamQuery = createStreamQueryUIData(id) when(statusListener.allQueryStatus).thenReturn(Seq(streamQuery)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala index 307479db33949..94844c4e87a84 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.internal.config.UI.{UI_ENABLED, UI_PORT} import org.apache.spark.sql.LocalSparkSession.withSparkSession import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.internal.StaticSQLConf.ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST import org.apache.spark.sql.streaming.StreamingQueryException import org.apache.spark.ui.SparkUICssErrorHandler @@ -53,6 +54,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B .setAppName("ui-test") .set(UI_ENABLED, true) .set(UI_PORT, 0) + .set(ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST, Seq("stateOnCurrentVersionSizeBytes")) additionalConfs.foreach { case (k, v) => conf.set(k, v) } val spark = SparkSession.builder().master(master).config(conf).getOrCreate() assert(spark.sparkContext.ui.isDefined) @@ -140,6 +142,10 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B summaryText should contain ("Aggregated Number Of Updated State Rows (?)") summaryText should contain ("Aggregated State Memory Used In Bytes (?)") summaryText should contain ("Aggregated Number Of Rows Dropped By Watermark (?)") + summaryText should contain ("Aggregated Custom Metric stateOnCurrentVersionSizeBytes" + + " 
(?)") + summaryText should not contain ("Aggregated Custom Metric loadedMapCacheHitCount (?)") + summaryText should not contain ("Aggregated Custom Metric loadedMapCacheMissCount (?)") } } finally { spark.streams.active.foreach(_.stop()) From 665817bd4fc07b18cee0f8c6ff759288472514c2 Mon Sep 17 00:00:00 2001 From: zero323 Date: Wed, 25 Nov 2020 09:27:04 +0900 Subject: [PATCH 002/150] [SPARK-33457][PYTHON] Adjust mypy configuration ### What changes were proposed in this pull request? This pull request: - Adds following flags to the main mypy configuration: - [`strict_optional`](https://mypy.readthedocs.io/en/stable/config_file.html#confval-strict_optional) - [`no_implicit_optional`](https://mypy.readthedocs.io/en/stable/config_file.html#confval-no_implicit_optional) - [`disallow_untyped_defs`](https://mypy.readthedocs.io/en/stable/config_file.html#confval-disallow_untyped_calls) These flags are enabled only for public API and disabled for tests and internal modules. Additionally, these PR fixes missing annotations. ### Why are the changes needed? Primary reason to propose this changes is to use standard configuration as used by typeshed project. This will allow us to be more strict, especially when interacting with JVM code. See for example https://github.com/apache/spark/pull/29122#pullrequestreview-513112882 Additionally, it will allow us to detect cases where annotations have unintentionally omitted. ### Does this PR introduce _any_ user-facing change? Annotations only. ### How was this patch tested? `dev/lint-python`. Closes #30382 from zero323/SPARK-33457. Authored-by: zero323 Signed-off-by: HyukjinKwon --- python/mypy.ini | 87 +++++++++++++++++++++++ python/pyspark/broadcast.pyi | 10 +-- python/pyspark/context.pyi | 25 +++++-- python/pyspark/ml/classification.pyi | 6 +- python/pyspark/ml/common.pyi | 10 ++- python/pyspark/ml/evaluation.pyi | 24 ++++--- python/pyspark/ml/feature.pyi | 20 ++++-- python/pyspark/ml/linalg/__init__.pyi | 36 +++++----- python/pyspark/ml/pipeline.pyi | 4 +- python/pyspark/ml/regression.pyi | 10 +-- python/pyspark/mllib/classification.pyi | 2 +- python/pyspark/mllib/clustering.pyi | 6 +- python/pyspark/mllib/common.pyi | 20 ++++-- python/pyspark/mllib/linalg/__init__.pyi | 45 +++++++----- python/pyspark/mllib/random.pyi | 2 +- python/pyspark/mllib/recommendation.pyi | 4 +- python/pyspark/mllib/stat/_statistics.pyi | 2 +- python/pyspark/rdd.pyi | 8 ++- python/pyspark/resource/profile.pyi | 2 +- python/pyspark/sql/column.pyi | 8 ++- python/pyspark/sql/context.pyi | 6 +- python/pyspark/sql/functions.pyi | 8 ++- python/pyspark/sql/session.pyi | 10 ++- python/pyspark/sql/types.pyi | 15 ++-- python/pyspark/sql/udf.pyi | 7 +- python/pyspark/streaming/context.pyi | 2 +- python/pyspark/streaming/dstream.pyi | 10 ++- python/pyspark/streaming/kinesis.pyi | 2 +- 28 files changed, 277 insertions(+), 114 deletions(-) diff --git a/python/mypy.ini b/python/mypy.ini index 4a5368a519097..5103452a053be 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -16,10 +16,97 @@ ; [mypy] +strict_optional = True +no_implicit_optional = True +disallow_untyped_defs = True + +; Allow untyped def in internal modules and tests + +[mypy-pyspark.daemon] +disallow_untyped_defs = False + +[mypy-pyspark.find_spark_home] +disallow_untyped_defs = False + +[mypy-pyspark._globals] +disallow_untyped_defs = False + +[mypy-pyspark.install] +disallow_untyped_defs = False + +[mypy-pyspark.java_gateway] +disallow_untyped_defs = False + +[mypy-pyspark.join] +disallow_untyped_defs = False + 
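The per-module overrides below keep internal modules and tests exempt, while public modules are checked with the three newly enabled flags (and `strict_optional` additionally treats `None` as incompatible with non-Optional types). As a rough illustration of what the flags reject — a hypothetical snippet, not taken from the pyspark codebase:

```python
from typing import Optional

def add(a, b):  # rejected by disallow_untyped_defs: the signature is unannotated
    return a + b

def greet(name: str = None) -> str:  # rejected by no_implicit_optional:
    return "hello %s" % name          # a None default needs an explicit Optional[str]

def greet_ok(name: Optional[str] = None) -> str:  # accepted: fully annotated,
    return "hello %s" % (name or "world")          # Optional spelled out
```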
+[mypy-pyspark.ml.tests.*] +disallow_untyped_defs = False + +[mypy-pyspark.mllib.tests.*] +disallow_untyped_defs = False + +[mypy-pyspark.rddsampler] +disallow_untyped_defs = False + +[mypy-pyspark.resource.tests.*] +disallow_untyped_defs = False + +[mypy-pyspark.serializers] +disallow_untyped_defs = False + +[mypy-pyspark.shuffle] +disallow_untyped_defs = False + +[mypy-pyspark.streaming.tests.*] +disallow_untyped_defs = False + +[mypy-pyspark.streaming.util] +disallow_untyped_defs = False + +[mypy-pyspark.sql.tests.*] +disallow_untyped_defs = False + +[mypy-pyspark.sql.pandas.serializers] +disallow_untyped_defs = False + +[mypy-pyspark.sql.pandas.types] +disallow_untyped_defs = False + +[mypy-pyspark.sql.pandas.typehints] +disallow_untyped_defs = False + +[mypy-pyspark.sql.pandas.utils] +disallow_untyped_defs = False + +[mypy-pyspark.sql.pandas._typing.protocols.*] +disallow_untyped_defs = False + +[mypy-pyspark.sql.utils] +disallow_untyped_defs = False + +[mypy-pyspark.tests.*] +disallow_untyped_defs = False + +[mypy-pyspark.testing.*] +disallow_untyped_defs = False + +[mypy-pyspark.traceback_utils] +disallow_untyped_defs = False + +[mypy-pyspark.util] +disallow_untyped_defs = False + +[mypy-pyspark.worker] +disallow_untyped_defs = False + +; Ignore errors in embedded third party code [mypy-pyspark.cloudpickle.*] ignore_errors = True +; Ignore missing imports for external untyped packages + [mypy-py4j.*] ignore_missing_imports = True diff --git a/python/pyspark/broadcast.pyi b/python/pyspark/broadcast.pyi index 4b019a509a003..944cb06d4178c 100644 --- a/python/pyspark/broadcast.pyi +++ b/python/pyspark/broadcast.pyi @@ -17,7 +17,7 @@ # under the License. import threading -from typing import Any, Dict, Generic, Optional, TypeVar +from typing import Any, Callable, Dict, Generic, Optional, Tuple, TypeVar T = TypeVar("T") @@ -32,14 +32,14 @@ class Broadcast(Generic[T]): path: Optional[Any] = ..., sock_file: Optional[Any] = ..., ) -> None: ... - def dump(self, value: Any, f: Any) -> None: ... - def load_from_path(self, path: Any): ... - def load(self, file: Any): ... + def dump(self, value: T, f: Any) -> None: ... + def load_from_path(self, path: Any) -> T: ... + def load(self, file: Any) -> T: ... @property def value(self) -> T: ... def unpersist(self, blocking: bool = ...) -> None: ... def destroy(self, blocking: bool = ...) -> None: ... - def __reduce__(self): ... + def __reduce__(self) -> Tuple[Callable[[int], T], Tuple[int]]: ... class BroadcastPickleRegistry(threading.local): def __init__(self) -> None: ... diff --git a/python/pyspark/context.pyi b/python/pyspark/context.pyi index 2789a38b3be9f..640a69cad08ab 100644 --- a/python/pyspark/context.pyi +++ b/python/pyspark/context.pyi @@ -16,7 +16,19 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + NoReturn, + Optional, + Tuple, + Type, + TypeVar, +) +from types import TracebackType from py4j.java_gateway import JavaGateway, JavaObject # type: ignore[import] @@ -51,9 +63,14 @@ class SparkContext: jsc: Optional[JavaObject] = ..., profiler_cls: type = ..., ) -> None: ... - def __getnewargs__(self): ... - def __enter__(self): ... - def __exit__(self, type, value, trace): ... + def __getnewargs__(self) -> NoReturn: ... + def __enter__(self) -> SparkContext: ... 
+ def __exit__( + self, + type: Optional[Type[BaseException]], + value: Optional[BaseException], + trace: Optional[TracebackType], + ) -> None: ... @classmethod def getOrCreate(cls, conf: Optional[SparkConf] = ...) -> SparkContext: ... def setLogLevel(self, logLevel: str) -> None: ... diff --git a/python/pyspark/ml/classification.pyi b/python/pyspark/ml/classification.pyi index 4bde851bb1e0d..c44176a13a69b 100644 --- a/python/pyspark/ml/classification.pyi +++ b/python/pyspark/ml/classification.pyi @@ -107,7 +107,7 @@ class _JavaProbabilisticClassifier( class _JavaProbabilisticClassificationModel( ProbabilisticClassificationModel, _JavaClassificationModel[T] ): - def predictProbability(self, value: Any): ... + def predictProbability(self, value: Vector) -> Vector: ... class _ClassificationSummary(JavaWrapper): @property @@ -543,7 +543,7 @@ class RandomForestClassificationModel( @property def trees(self) -> List[DecisionTreeClassificationModel]: ... def summary(self) -> RandomForestClassificationTrainingSummary: ... - def evaluate(self, dataset) -> RandomForestClassificationSummary: ... + def evaluate(self, dataset: DataFrame) -> RandomForestClassificationSummary: ... class RandomForestClassificationSummary(_ClassificationSummary): ... class RandomForestClassificationTrainingSummary( @@ -891,7 +891,7 @@ class FMClassifier( solver: str = ..., thresholds: Optional[Any] = ..., seed: Optional[Any] = ..., - ): ... + ) -> FMClassifier: ... def setFactorSize(self, value: int) -> FMClassifier: ... def setFitLinear(self, value: bool) -> FMClassifier: ... def setMiniBatchFraction(self, value: float) -> FMClassifier: ... diff --git a/python/pyspark/ml/common.pyi b/python/pyspark/ml/common.pyi index 7bf0ed6183d8a..a38fc5734f466 100644 --- a/python/pyspark/ml/common.pyi +++ b/python/pyspark/ml/common.pyi @@ -16,5 +16,11 @@ # specific language governing permissions and limitations # under the License. -def callJavaFunc(sc, func, *args): ... -def inherit_doc(cls): ... +from typing import Any, TypeVar + +import pyspark.context + +C = TypeVar("C", bound=type) + +def callJavaFunc(sc: pyspark.context.SparkContext, func: Any, *args: Any) -> Any: ... +def inherit_doc(cls: C) -> C: ... diff --git a/python/pyspark/ml/evaluation.pyi b/python/pyspark/ml/evaluation.pyi index ea0a9f045cd6a..55a3ae2774115 100644 --- a/python/pyspark/ml/evaluation.pyi +++ b/python/pyspark/ml/evaluation.pyi @@ -39,9 +39,12 @@ from pyspark.ml.param.shared import ( HasWeightCol, ) from pyspark.ml.util import JavaMLReadable, JavaMLWritable +from pyspark.sql.dataframe import DataFrame class Evaluator(Params, metaclass=abc.ABCMeta): - def evaluate(self, dataset, params: Optional[ParamMap] = ...) -> float: ... + def evaluate( + self, dataset: DataFrame, params: Optional[ParamMap] = ... + ) -> float: ... def isLargerBetter(self) -> bool: ... class JavaEvaluator(JavaParams, Evaluator, metaclass=abc.ABCMeta): @@ -75,16 +78,15 @@ class BinaryClassificationEvaluator( def setLabelCol(self, value: str) -> BinaryClassificationEvaluator: ... def setRawPredictionCol(self, value: str) -> BinaryClassificationEvaluator: ... def setWeightCol(self, value: str) -> BinaryClassificationEvaluator: ... - -def setParams( - self, - *, - rawPredictionCol: str = ..., - labelCol: str = ..., - metricName: BinaryClassificationEvaluatorMetricType = ..., - weightCol: Optional[str] = ..., - numBins: int = ... -) -> BinaryClassificationEvaluator: ... 
+ def setParams( + self, + *, + rawPredictionCol: str = ..., + labelCol: str = ..., + metricName: BinaryClassificationEvaluatorMetricType = ..., + weightCol: Optional[str] = ..., + numBins: int = ... + ) -> BinaryClassificationEvaluator: ... class RegressionEvaluator( JavaEvaluator, diff --git a/python/pyspark/ml/feature.pyi b/python/pyspark/ml/feature.pyi index f5b12a5b2ffc6..4999defdf8a70 100644 --- a/python/pyspark/ml/feature.pyi +++ b/python/pyspark/ml/feature.pyi @@ -100,9 +100,9 @@ class _LSHParams(HasInputCol, HasOutputCol): def getNumHashTables(self) -> int: ... class _LSH(Generic[JM], JavaEstimator[JM], _LSHParams, JavaMLReadable, JavaMLWritable): - def setNumHashTables(self: P, value) -> P: ... - def setInputCol(self: P, value) -> P: ... - def setOutputCol(self: P, value) -> P: ... + def setNumHashTables(self: P, value: int) -> P: ... + def setInputCol(self: P, value: str) -> P: ... + def setOutputCol(self: P, value: str) -> P: ... class _LSHModel(JavaModel, _LSHParams): def setInputCol(self: P, value: str) -> P: ... @@ -1518,7 +1518,7 @@ class ChiSqSelector( fpr: float = ..., fdr: float = ..., fwe: float = ... - ): ... + ) -> ChiSqSelector: ... def setSelectorType(self, value: str) -> ChiSqSelector: ... def setNumTopFeatures(self, value: int) -> ChiSqSelector: ... def setPercentile(self, value: float) -> ChiSqSelector: ... @@ -1602,7 +1602,10 @@ class _VarianceThresholdSelectorParams(HasFeaturesCol, HasOutputCol): def getVarianceThreshold(self) -> float: ... class VarianceThresholdSelector( - JavaEstimator, _VarianceThresholdSelectorParams, JavaMLReadable, JavaMLWritable + JavaEstimator[VarianceThresholdSelectorModel], + _VarianceThresholdSelectorParams, + JavaMLReadable[VarianceThresholdSelector], + JavaMLWritable, ): def __init__( self, @@ -1615,13 +1618,16 @@ class VarianceThresholdSelector( featuresCol: str = ..., outputCol: Optional[str] = ..., varianceThreshold: float = ..., - ): ... + ) -> VarianceThresholdSelector: ... def setVarianceThreshold(self, value: float) -> VarianceThresholdSelector: ... def setFeaturesCol(self, value: str) -> VarianceThresholdSelector: ... def setOutputCol(self, value: str) -> VarianceThresholdSelector: ... class VarianceThresholdSelectorModel( - JavaModel, _VarianceThresholdSelectorParams, JavaMLReadable, JavaMLWritable + JavaModel, + _VarianceThresholdSelectorParams, + JavaMLReadable[VarianceThresholdSelectorModel], + JavaMLWritable, ): def setFeaturesCol(self, value: str) -> VarianceThresholdSelectorModel: ... def setOutputCol(self, value: str) -> VarianceThresholdSelectorModel: ... diff --git a/python/pyspark/ml/linalg/__init__.pyi b/python/pyspark/ml/linalg/__init__.pyi index a576b30aec308..b4fba8823b678 100644 --- a/python/pyspark/ml/linalg/__init__.pyi +++ b/python/pyspark/ml/linalg/__init__.pyi @@ -17,7 +17,7 @@ # under the License. from typing import overload -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, NoReturn, Optional, Tuple, Type, Union from pyspark.ml import linalg as newlinalg # noqa: F401 from pyspark.sql.types import StructType, UserDefinedType @@ -45,7 +45,7 @@ class MatrixUDT(UserDefinedType): @classmethod def scalaUDT(cls) -> str: ... def serialize( - self, obj + self, obj: Matrix ) -> Tuple[ int, int, int, Optional[List[int]], Optional[List[int]], List[float], bool ]: ... @@ -64,9 +64,7 @@ class DenseVector(Vector): def __init__(self, __arr: bytes) -> None: ... @overload def __init__(self, __arr: Iterable[float]) -> None: ... 
- @staticmethod - def parse(s) -> DenseVector: ... - def __reduce__(self) -> Tuple[type, bytes]: ... + def __reduce__(self) -> Tuple[Type[DenseVector], bytes]: ... def numNonzeros(self) -> int: ... def norm(self, p: Union[float, str]) -> float64: ... def dot(self, other: Iterable[float]) -> float64: ... @@ -112,16 +110,14 @@ class SparseVector(Vector): def __init__(self, size: int, __map: Dict[int, float]) -> None: ... def numNonzeros(self) -> int: ... def norm(self, p: Union[float, str]) -> float64: ... - def __reduce__(self): ... - @staticmethod - def parse(s: str) -> SparseVector: ... + def __reduce__(self) -> Tuple[Type[SparseVector], Tuple[int, bytes, bytes]]: ... def dot(self, other: Iterable[float]) -> float64: ... def squared_distance(self, other: Iterable[float]) -> float64: ... def toArray(self) -> ndarray: ... def __len__(self) -> int: ... - def __eq__(self, other) -> bool: ... + def __eq__(self, other: Any) -> bool: ... def __getitem__(self, index: int) -> float64: ... - def __ne__(self, other) -> bool: ... + def __ne__(self, other: Any) -> bool: ... def __hash__(self) -> int: ... class Vectors: @@ -144,13 +140,13 @@ class Vectors: def sparse(size: int, __map: Dict[int, float]) -> SparseVector: ... @overload @staticmethod - def dense(self, *elements: float) -> DenseVector: ... + def dense(*elements: float) -> DenseVector: ... @overload @staticmethod - def dense(self, __arr: bytes) -> DenseVector: ... + def dense(__arr: bytes) -> DenseVector: ... @overload @staticmethod - def dense(self, __arr: Iterable[float]) -> DenseVector: ... + def dense(__arr: Iterable[float]) -> DenseVector: ... @staticmethod def stringify(vector: Vector) -> str: ... @staticmethod @@ -158,8 +154,6 @@ class Vectors: @staticmethod def norm(vector: Vector, p: Union[float, str]) -> float64: ... @staticmethod - def parse(s: str) -> Vector: ... - @staticmethod def zeros(size: int) -> DenseVector: ... class Matrix: @@ -170,7 +164,7 @@ class Matrix: def __init__( self, numRows: int, numCols: int, isTransposed: bool = ... ) -> None: ... - def toArray(self): ... + def toArray(self) -> NoReturn: ... class DenseMatrix(Matrix): values: Any @@ -186,11 +180,11 @@ class DenseMatrix(Matrix): values: Iterable[float], isTransposed: bool = ..., ) -> None: ... - def __reduce__(self) -> Tuple[type, Tuple[int, int, bytes, int]]: ... + def __reduce__(self) -> Tuple[Type[DenseMatrix], Tuple[int, int, bytes, int]]: ... def toArray(self) -> ndarray: ... def toSparse(self) -> SparseMatrix: ... def __getitem__(self, indices: Tuple[int, int]) -> float64: ... - def __eq__(self, other) -> bool: ... + def __eq__(self, other: Any) -> bool: ... class SparseMatrix(Matrix): colPtrs: ndarray @@ -216,11 +210,13 @@ class SparseMatrix(Matrix): values: Iterable[float], isTransposed: bool = ..., ) -> None: ... - def __reduce__(self) -> Tuple[type, Tuple[int, int, bytes, bytes, bytes, int]]: ... + def __reduce__( + self, + ) -> Tuple[Type[SparseMatrix], Tuple[int, int, bytes, bytes, bytes, int]]: ... def __getitem__(self, indices: Tuple[int, int]) -> float64: ... def toArray(self) -> ndarray: ... def toDense(self) -> DenseMatrix: ... - def __eq__(self, other) -> bool: ... + def __eq__(self, other: Any) -> bool: ... class Matrices: @overload diff --git a/python/pyspark/ml/pipeline.pyi b/python/pyspark/ml/pipeline.pyi index 44680586d70d1..f47e9e012ae14 100644 --- a/python/pyspark/ml/pipeline.pyi +++ b/python/pyspark/ml/pipeline.pyi @@ -51,7 +51,7 @@ class PipelineWriter(MLWriter): def __init__(self, instance: Pipeline) -> None: ... 
def saveImpl(self, path: str) -> None: ... -class PipelineReader(MLReader): +class PipelineReader(MLReader[Pipeline]): cls: Type[Pipeline] def __init__(self, cls: Type[Pipeline]) -> None: ... def load(self, path: str) -> Pipeline: ... @@ -61,7 +61,7 @@ class PipelineModelWriter(MLWriter): def __init__(self, instance: PipelineModel) -> None: ... def saveImpl(self, path: str) -> None: ... -class PipelineModelReader(MLReader): +class PipelineModelReader(MLReader[PipelineModel]): cls: Type[PipelineModel] def __init__(self, cls: Type[PipelineModel]) -> None: ... def load(self, path: str) -> PipelineModel: ... diff --git a/python/pyspark/ml/regression.pyi b/python/pyspark/ml/regression.pyi index 5cb0e7a5092f7..b8f1e61859c72 100644 --- a/python/pyspark/ml/regression.pyi +++ b/python/pyspark/ml/regression.pyi @@ -414,7 +414,7 @@ class RandomForestRegressionModel( _TreeEnsembleModel, _RandomForestRegressorParams, JavaMLWritable, - JavaMLReadable, + JavaMLReadable[RandomForestRegressionModel], ): @property def trees(self) -> List[DecisionTreeRegressionModel]: ... @@ -749,10 +749,10 @@ class _FactorizationMachinesParams( initStd: Param[float] solver: Param[str] def __init__(self, *args: Any): ... - def getFactorSize(self): ... - def getFitLinear(self): ... - def getMiniBatchFraction(self): ... - def getInitStd(self): ... + def getFactorSize(self) -> int: ... + def getFitLinear(self) -> bool: ... + def getMiniBatchFraction(self) -> float: ... + def getInitStd(self) -> float: ... class FMRegressor( _JavaRegressor[FMRegressionModel], diff --git a/python/pyspark/mllib/classification.pyi b/python/pyspark/mllib/classification.pyi index c51882c87bfc2..967b0a9f289dd 100644 --- a/python/pyspark/mllib/classification.pyi +++ b/python/pyspark/mllib/classification.pyi @@ -118,7 +118,7 @@ class NaiveBayesModel(Saveable, Loader[NaiveBayesModel]): labels: ndarray pi: ndarray theta: ndarray - def __init__(self, labels, pi, theta) -> None: ... + def __init__(self, labels: ndarray, pi: ndarray, theta: ndarray) -> None: ... @overload def predict(self, x: VectorLike) -> float64: ... @overload diff --git a/python/pyspark/mllib/clustering.pyi b/python/pyspark/mllib/clustering.pyi index 1c3eba17e201c..b4f349612f0fe 100644 --- a/python/pyspark/mllib/clustering.pyi +++ b/python/pyspark/mllib/clustering.pyi @@ -63,7 +63,7 @@ class BisectingKMeans: class KMeansModel(Saveable, Loader[KMeansModel]): centers: List[ndarray] - def __init__(self, centers: List[ndarray]) -> None: ... + def __init__(self, centers: List[VectorLike]) -> None: ... @property def clusterCenters(self) -> List[ndarray]: ... @property @@ -144,7 +144,9 @@ class PowerIterationClustering: class Assignment(NamedTuple("Assignment", [("id", int), ("cluster", int)])): ... class StreamingKMeansModel(KMeansModel): - def __init__(self, clusterCenters, clusterWeights) -> None: ... + def __init__( + self, clusterCenters: List[VectorLike], clusterWeights: VectorLike + ) -> None: ... @property def clusterWeights(self) -> List[float64]: ... centers: ndarray diff --git a/python/pyspark/mllib/common.pyi b/python/pyspark/mllib/common.pyi index 1df308b91b5a1..daba212d93633 100644 --- a/python/pyspark/mllib/common.pyi +++ b/python/pyspark/mllib/common.pyi @@ -16,12 +16,20 @@ # specific language governing permissions and limitations # under the License. -def callJavaFunc(sc, func, *args): ... -def callMLlibFunc(name, *args): ... 
+from typing import Any, TypeVar + +import pyspark.context + +from py4j.java_gateway import JavaObject + +C = TypeVar("C", bound=type) + +def callJavaFunc(sc: pyspark.context.SparkContext, func: Any, *args: Any) -> Any: ... +def callMLlibFunc(name: str, *args: Any) -> Any: ... class JavaModelWrapper: - def __init__(self, java_model) -> None: ... - def __del__(self): ... - def call(self, name, *a): ... + def __init__(self, java_model: JavaObject) -> None: ... + def __del__(self) -> None: ... + def call(self, name: str, *a: Any) -> Any: ... -def inherit_doc(cls): ... +def inherit_doc(cls: C) -> C: ... diff --git a/python/pyspark/mllib/linalg/__init__.pyi b/python/pyspark/mllib/linalg/__init__.pyi index c0719c535c8f4..60d16b26f3590 100644 --- a/python/pyspark/mllib/linalg/__init__.pyi +++ b/python/pyspark/mllib/linalg/__init__.pyi @@ -17,7 +17,18 @@ # under the License. from typing import overload -from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + Generic, + Iterable, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, +) from pyspark.ml import linalg as newlinalg from pyspark.sql.types import StructType, UserDefinedType from numpy import float64, ndarray # type: ignore[import] @@ -46,7 +57,7 @@ class MatrixUDT(UserDefinedType): @classmethod def scalaUDT(cls) -> str: ... def serialize( - self, obj + self, obj: Matrix ) -> Tuple[ int, int, int, Optional[List[int]], Optional[List[int]], List[float], bool ]: ... @@ -67,8 +78,8 @@ class DenseVector(Vector): @overload def __init__(self, __arr: Iterable[float]) -> None: ... @staticmethod - def parse(s) -> DenseVector: ... - def __reduce__(self) -> Tuple[type, bytes]: ... + def parse(s: str) -> DenseVector: ... + def __reduce__(self) -> Tuple[Type[DenseVector], bytes]: ... def numNonzeros(self) -> int: ... def norm(self, p: Union[float, str]) -> float64: ... def dot(self, other: Iterable[float]) -> float64: ... @@ -115,7 +126,7 @@ class SparseVector(Vector): def __init__(self, size: int, __map: Dict[int, float]) -> None: ... def numNonzeros(self) -> int: ... def norm(self, p: Union[float, str]) -> float64: ... - def __reduce__(self): ... + def __reduce__(self) -> Tuple[Type[SparseVector], Tuple[int, bytes, bytes]]: ... @staticmethod def parse(s: str) -> SparseVector: ... def dot(self, other: Iterable[float]) -> float64: ... @@ -123,9 +134,9 @@ class SparseVector(Vector): def toArray(self) -> ndarray: ... def asML(self) -> newlinalg.SparseVector: ... def __len__(self) -> int: ... - def __eq__(self, other) -> bool: ... + def __eq__(self, other: Any) -> bool: ... def __getitem__(self, index: int) -> float64: ... - def __ne__(self, other) -> bool: ... + def __ne__(self, other: Any) -> bool: ... def __hash__(self) -> int: ... class Vectors: @@ -148,13 +159,13 @@ class Vectors: def sparse(size: int, __map: Dict[int, float]) -> SparseVector: ... @overload @staticmethod - def dense(self, *elements: float) -> DenseVector: ... + def dense(*elements: float) -> DenseVector: ... @overload @staticmethod - def dense(self, __arr: bytes) -> DenseVector: ... + def dense(__arr: bytes) -> DenseVector: ... @overload @staticmethod - def dense(self, __arr: Iterable[float]) -> DenseVector: ... + def dense(__arr: Iterable[float]) -> DenseVector: ... @staticmethod def fromML(vec: newlinalg.DenseVector) -> DenseVector: ... @staticmethod @@ -176,8 +187,8 @@ class Matrix: def __init__( self, numRows: int, numCols: int, isTransposed: bool = ... ) -> None: ... - def toArray(self): ... 
- def asML(self): ... + def toArray(self) -> ndarray: ... + def asML(self) -> newlinalg.Matrix: ... class DenseMatrix(Matrix): values: Any @@ -193,12 +204,12 @@ class DenseMatrix(Matrix): values: Iterable[float], isTransposed: bool = ..., ) -> None: ... - def __reduce__(self) -> Tuple[type, Tuple[int, int, bytes, int]]: ... + def __reduce__(self) -> Tuple[Type[DenseMatrix], Tuple[int, int, bytes, int]]: ... def toArray(self) -> ndarray: ... def toSparse(self) -> SparseMatrix: ... def asML(self) -> newlinalg.DenseMatrix: ... def __getitem__(self, indices: Tuple[int, int]) -> float64: ... - def __eq__(self, other) -> bool: ... + def __eq__(self, other: Any) -> bool: ... class SparseMatrix(Matrix): colPtrs: ndarray @@ -224,12 +235,14 @@ class SparseMatrix(Matrix): values: Iterable[float], isTransposed: bool = ..., ) -> None: ... - def __reduce__(self) -> Tuple[type, Tuple[int, int, bytes, bytes, bytes, int]]: ... + def __reduce__( + self, + ) -> Tuple[Type[SparseMatrix], Tuple[int, int, bytes, bytes, bytes, int]]: ... def __getitem__(self, indices: Tuple[int, int]) -> float64: ... def toArray(self) -> ndarray: ... def toDense(self) -> DenseMatrix: ... def asML(self) -> newlinalg.SparseMatrix: ... - def __eq__(self, other) -> bool: ... + def __eq__(self, other: Any) -> bool: ... class Matrices: @overload diff --git a/python/pyspark/mllib/random.pyi b/python/pyspark/mllib/random.pyi index dc5f4701614da..ec83170625c74 100644 --- a/python/pyspark/mllib/random.pyi +++ b/python/pyspark/mllib/random.pyi @@ -90,7 +90,7 @@ class RandomRDDs: def logNormalVectorRDD( sc: SparkContext, mean: float, - std, + std: float, numRows: int, numCols: int, numPartitions: Optional[int] = ..., diff --git a/python/pyspark/mllib/recommendation.pyi b/python/pyspark/mllib/recommendation.pyi index e2f15494209e9..4fea0acf3c1f9 100644 --- a/python/pyspark/mllib/recommendation.pyi +++ b/python/pyspark/mllib/recommendation.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Type, Union import array from collections import namedtuple @@ -27,7 +27,7 @@ from pyspark.mllib.common import JavaModelWrapper from pyspark.mllib.util import JavaLoader, JavaSaveable class Rating(namedtuple("Rating", ["user", "product", "rating"])): - def __reduce__(self): ... + def __reduce__(self) -> Tuple[Type[Rating], Tuple[int, int, float]]: ... class MatrixFactorizationModel( JavaModelWrapper, JavaSaveable, JavaLoader[MatrixFactorizationModel] diff --git a/python/pyspark/mllib/stat/_statistics.pyi b/python/pyspark/mllib/stat/_statistics.pyi index 4d2701d486881..3834d51639eb2 100644 --- a/python/pyspark/mllib/stat/_statistics.pyi +++ b/python/pyspark/mllib/stat/_statistics.pyi @@ -65,5 +65,5 @@ class Statistics: def chiSqTest(observed: RDD[LabeledPoint]) -> List[ChiSqTestResult]: ... @staticmethod def kolmogorovSmirnovTest( - data, distName: Literal["norm"] = ..., *params: float + data: RDD[float], distName: Literal["norm"] = ..., *params: float ) -> KolmogorovSmirnovTestResult: ... diff --git a/python/pyspark/rdd.pyi b/python/pyspark/rdd.pyi index 35c49e952b0cd..a277cd9f7edae 100644 --- a/python/pyspark/rdd.pyi +++ b/python/pyspark/rdd.pyi @@ -85,12 +85,16 @@ class PythonEvalType: SQL_COGROUPED_MAP_PANDAS_UDF: PandasCogroupedMapUDFType class BoundedFloat(float): - def __new__(cls, mean: float, confidence: float, low: float, high: float): ... 
+ def __new__( + cls, mean: float, confidence: float, low: float, high: float + ) -> BoundedFloat: ... class Partitioner: numPartitions: int partitionFunc: Callable[[Any], int] - def __init__(self, numPartitions, partitionFunc) -> None: ... + def __init__( + self, numPartitions: int, partitionFunc: Callable[[Any], int] + ) -> None: ... def __eq__(self, other: Any) -> bool: ... def __call__(self, k: Any) -> int: ... diff --git a/python/pyspark/resource/profile.pyi b/python/pyspark/resource/profile.pyi index 6763baf6590a3..04838692436df 100644 --- a/python/pyspark/resource/profile.pyi +++ b/python/pyspark/resource/profile.pyi @@ -49,7 +49,7 @@ class ResourceProfileBuilder: def __init__(self) -> None: ... def require( self, resourceRequest: Union[ExecutorResourceRequest, TaskResourceRequests] - ): ... + ) -> ResourceProfileBuilder: ... def clearExecutorResourceRequests(self) -> None: ... def clearTaskResourceRequests(self) -> None: ... @property diff --git a/python/pyspark/sql/column.pyi b/python/pyspark/sql/column.pyi index 0fbb10053fdbf..1f63e65b3de81 100644 --- a/python/pyspark/sql/column.pyi +++ b/python/pyspark/sql/column.pyi @@ -32,7 +32,7 @@ from pyspark.sql.window import WindowSpec from py4j.java_gateway import JavaObject # type: ignore[import] class Column: - def __init__(self, JavaObject) -> None: ... + def __init__(self, jc: JavaObject) -> None: ... def __neg__(self) -> Column: ... def __add__(self, other: Union[Column, LiteralType, DecimalLiteral]) -> Column: ... def __sub__(self, other: Union[Column, LiteralType, DecimalLiteral]) -> Column: ... @@ -105,7 +105,11 @@ class Column: def name(self, *alias: str) -> Column: ... def cast(self, dataType: Union[DataType, str]) -> Column: ... def astype(self, dataType: Union[DataType, str]) -> Column: ... - def between(self, lowerBound, upperBound) -> Column: ... + def between( + self, + lowerBound: Union[Column, LiteralType, DateTimeLiteral, DecimalLiteral], + upperBound: Union[Column, LiteralType, DateTimeLiteral, DecimalLiteral], + ) -> Column: ... def when(self, condition: Column, value: Any) -> Column: ... def otherwise(self, value: Any) -> Column: ... def over(self, window: WindowSpec) -> Column: ... diff --git a/python/pyspark/sql/context.pyi b/python/pyspark/sql/context.pyi index 64927b37ac2a9..915a0fe1f6709 100644 --- a/python/pyspark/sql/context.pyi +++ b/python/pyspark/sql/context.pyi @@ -43,14 +43,14 @@ class SQLContext: sparkSession: SparkSession def __init__( self, - sparkContext, + sparkContext: SparkContext, sparkSession: Optional[SparkSession] = ..., jsqlContext: Optional[JavaObject] = ..., ) -> None: ... @classmethod def getOrCreate(cls: type, sc: SparkContext) -> SQLContext: ... def newSession(self) -> SQLContext: ... - def setConf(self, key: str, value) -> None: ... + def setConf(self, key: str, value: Union[bool, int, str]) -> None: ... def getConf(self, key: str, defaultValue: Optional[str] = ...) -> str: ... @property def udf(self) -> UDFRegistration: ... @@ -116,7 +116,7 @@ class SQLContext: path: Optional[str] = ..., source: Optional[str] = ..., schema: Optional[StructType] = ..., - **options + **options: str ) -> DataFrame: ... def sql(self, sqlQuery: str) -> DataFrame: ... def table(self, tableName: str) -> DataFrame: ... diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index 281c1d75436c6..252f883b5fb09 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -65,13 +65,13 @@ def round(col: ColumnOrName, scale: int = ...) -> Column: ... 
def bround(col: ColumnOrName, scale: int = ...) -> Column: ... def shiftLeft(col: ColumnOrName, numBits: int) -> Column: ... def shiftRight(col: ColumnOrName, numBits: int) -> Column: ... -def shiftRightUnsigned(col, numBits) -> Column: ... +def shiftRightUnsigned(col: ColumnOrName, numBits: int) -> Column: ... def spark_partition_id() -> Column: ... def expr(str: str) -> Column: ... def struct(*cols: ColumnOrName) -> Column: ... def greatest(*cols: ColumnOrName) -> Column: ... def least(*cols: Column) -> Column: ... -def when(condition: Column, value) -> Column: ... +def when(condition: Column, value: Any) -> Column: ... @overload def log(arg1: ColumnOrName) -> Column: ... @overload @@ -174,7 +174,9 @@ def create_map(*cols: ColumnOrName) -> Column: ... def array(*cols: ColumnOrName) -> Column: ... def array_contains(col: ColumnOrName, value: Any) -> Column: ... def arrays_overlap(a1: ColumnOrName, a2: ColumnOrName) -> Column: ... -def slice(x: ColumnOrName, start: Union[Column, int], length: Union[Column, int]) -> Column: ... +def slice( + x: ColumnOrName, start: Union[Column, int], length: Union[Column, int] +) -> Column: ... def array_join( col: ColumnOrName, delimiter: str, null_replacement: Optional[str] = ... ) -> Column: ... diff --git a/python/pyspark/sql/session.pyi b/python/pyspark/sql/session.pyi index 17ba8894c1731..6cd2d3bed2b2f 100644 --- a/python/pyspark/sql/session.pyi +++ b/python/pyspark/sql/session.pyi @@ -17,7 +17,8 @@ # under the License. from typing import overload -from typing import Any, Iterable, List, Optional, Tuple, TypeVar, Union +from typing import Any, Iterable, List, Optional, Tuple, Type, TypeVar, Union +from types import TracebackType from py4j.java_gateway import JavaObject # type: ignore[import] @@ -122,4 +123,9 @@ class SparkSession(SparkConversionMixin): def streams(self) -> StreamingQueryManager: ... def stop(self) -> None: ... def __enter__(self) -> SparkSession: ... - def __exit__(self, exc_type, exc_val, exc_tb) -> None: ... + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[TracebackType], + ) -> None: ... diff --git a/python/pyspark/sql/types.pyi b/python/pyspark/sql/types.pyi index 31765e94884d7..3adf823d99a82 100644 --- a/python/pyspark/sql/types.pyi +++ b/python/pyspark/sql/types.pyi @@ -17,7 +17,8 @@ # under the License. from typing import overload -from typing import Any, Callable, Dict, Iterator, List, Optional, Union, Tuple, TypeVar +from typing import Any, Callable, Dict, Iterator, List, Optional, Union, Tuple, Type, TypeVar +from py4j.java_gateway import JavaGateway, JavaObject import datetime T = TypeVar("T") @@ -37,7 +38,7 @@ class DataType: def fromInternal(self, obj: Any) -> Any: ... class DataTypeSingleton(type): - def __call__(cls): ... + def __call__(cls: Type[T]) -> T: ... # type: ignore class NullType(DataType, metaclass=DataTypeSingleton): ... class AtomicType(DataType): ... @@ -85,8 +86,8 @@ class ShortType(IntegralType): class ArrayType(DataType): elementType: DataType containsNull: bool - def __init__(self, elementType=DataType, containsNull: bool = ...) -> None: ... - def simpleString(self): ... + def __init__(self, elementType: DataType, containsNull: bool = ...) -> None: ... + def simpleString(self) -> str: ... def jsonValue(self) -> Dict[str, Any]: ... @classmethod def fromJson(cls, json: Dict[str, Any]) -> ArrayType: ... @@ -197,8 +198,8 @@ class Row(tuple): class DateConverter: def can_convert(self, obj: Any) -> bool: ... 
- def convert(self, obj, gateway_client) -> Any: ... + def convert(self, obj: datetime.date, gateway_client: JavaGateway) -> JavaObject: ... class DatetimeConverter: - def can_convert(self, obj) -> bool: ... - def convert(self, obj, gateway_client) -> Any: ... + def can_convert(self, obj: Any) -> bool: ... + def convert(self, obj: datetime.datetime, gateway_client: JavaGateway) -> JavaObject: ... diff --git a/python/pyspark/sql/udf.pyi b/python/pyspark/sql/udf.pyi index 87c3672780037..ea61397a67ba1 100644 --- a/python/pyspark/sql/udf.pyi +++ b/python/pyspark/sql/udf.pyi @@ -18,8 +18,9 @@ from typing import Any, Callable, Optional -from pyspark.sql._typing import ColumnOrName, DataTypeOrString +from pyspark.sql._typing import ColumnOrName, DataTypeOrString, UserDefinedFunctionLike from pyspark.sql.column import Column +from pyspark.sql.types import DataType import pyspark.sql.session class UserDefinedFunction: @@ -35,7 +36,7 @@ class UserDefinedFunction: deterministic: bool = ..., ) -> None: ... @property - def returnType(self): ... + def returnType(self) -> DataType: ... def __call__(self, *cols: ColumnOrName) -> Column: ... def asNondeterministic(self) -> UserDefinedFunction: ... @@ -47,7 +48,7 @@ class UDFRegistration: name: str, f: Callable[..., Any], returnType: Optional[DataTypeOrString] = ..., - ): ... + ) -> UserDefinedFunctionLike: ... def registerJavaFunction( self, name: str, diff --git a/python/pyspark/streaming/context.pyi b/python/pyspark/streaming/context.pyi index 026163fc9a1db..117a6742e6b6b 100644 --- a/python/pyspark/streaming/context.pyi +++ b/python/pyspark/streaming/context.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Callable, List, Optional, TypeVar, Union +from typing import Any, Callable, List, Optional, TypeVar from py4j.java_gateway import JavaObject # type: ignore[import] diff --git a/python/pyspark/streaming/dstream.pyi b/python/pyspark/streaming/dstream.pyi index 7b76ce4c65233..1521d838fc2b5 100644 --- a/python/pyspark/streaming/dstream.pyi +++ b/python/pyspark/streaming/dstream.pyi @@ -30,9 +30,12 @@ from typing import ( ) import datetime from pyspark.rdd import RDD +import pyspark.serializers from pyspark.storagelevel import StorageLevel import pyspark.streaming.context +from py4j.java_gateway import JavaObject + S = TypeVar("S") T = TypeVar("T") U = TypeVar("U") @@ -42,7 +45,12 @@ V = TypeVar("V") class DStream(Generic[T]): is_cached: bool is_checkpointed: bool - def __init__(self, jdstream, ssc, jrdd_deserializer) -> None: ... + def __init__( + self, + jdstream: JavaObject, + ssc: pyspark.streaming.context.StreamingContext, + jrdd_deserializer: pyspark.serializers.Serializer, + ) -> None: ... def context(self) -> pyspark.streaming.context.StreamingContext: ... def count(self) -> DStream[int]: ... def filter(self, f: Callable[[T], bool]) -> DStream[T]: ... diff --git a/python/pyspark/streaming/kinesis.pyi b/python/pyspark/streaming/kinesis.pyi index af7cd6f6ec13c..399c37f869620 100644 --- a/python/pyspark/streaming/kinesis.pyi +++ b/python/pyspark/streaming/kinesis.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Any, Callable, Optional, TypeVar +from typing import Callable, Optional, TypeVar from pyspark.storagelevel import StorageLevel from pyspark.streaming.context import StreamingContext from pyspark.streaming.dstream import DStream From 01321bc0fec54a1610d0873c17fa7354137d3a6b Mon Sep 17 00:00:00 2001 From: zero323 Date: Wed, 25 Nov 2020 10:24:41 +0900 Subject: [PATCH 003/150] [SPARK-33252][PYTHON][DOCS] Migration to NumPy documentation style in MLlib (pyspark.mllib.*) ### What changes were proposed in this pull request? This PR proposes migration of `pyspark.mllib` to NumPy documentation style. ### Why are the changes needed? To improve documentation style. Before: ![old](https://user-images.githubusercontent.com/1554276/100097941-90234980-2e5d-11eb-8b4d-c25d98d85191.png) After: ![new](https://user-images.githubusercontent.com/1554276/100097966-987b8480-2e5d-11eb-9e02-07b18c327624.png) ### Does this PR introduce _any_ user-facing change? Yes, this changes both rendered HTML docs and console representation (SPARK-33243). ### How was this patch tested? `dev/lint-python` and manual inspection. Closes #30413 from zero323/SPARK-33252. Authored-by: zero323 Signed-off-by: HyukjinKwon --- .../docs/source/reference/pyspark.mllib.rst | 3 +- python/pyspark/mllib/classification.py | 353 ++++++----- python/pyspark/mllib/clustering.py | 576 +++++++++++------- python/pyspark/mllib/evaluation.py | 60 +- python/pyspark/mllib/feature.py | 288 ++++++--- python/pyspark/mllib/feature.pyi | 4 +- python/pyspark/mllib/fpm.py | 86 +-- python/pyspark/mllib/fpm.pyi | 4 +- python/pyspark/mllib/linalg/__init__.py | 132 +++- python/pyspark/mllib/linalg/distributed.py | 495 ++++++++++----- python/pyspark/mllib/linalg/distributed.pyi | 6 +- python/pyspark/mllib/random.py | 378 ++++++++---- python/pyspark/mllib/recommendation.py | 116 ++-- python/pyspark/mllib/regression.py | 392 +++++++----- python/pyspark/mllib/stat/KernelDensity.py | 2 + python/pyspark/mllib/stat/__init__.py | 5 +- python/pyspark/mllib/stat/_statistics.py | 115 ++-- python/pyspark/mllib/stat/distribution.py | 2 + python/pyspark/mllib/tree.py | 469 +++++++------- python/pyspark/mllib/util.py | 256 +++++--- 20 files changed, 2375 insertions(+), 1367 deletions(-) diff --git a/python/docs/source/reference/pyspark.mllib.rst b/python/docs/source/reference/pyspark.mllib.rst index acc834c065ac3..df5ea017d0fbf 100644 --- a/python/docs/source/reference/pyspark.mllib.rst +++ b/python/docs/source/reference/pyspark.mllib.rst @@ -216,6 +216,8 @@ Statistics ChiSqTestResult MultivariateGaussian KernelDensity + ChiSqTestResult + KolmogorovSmirnovTestResult Tree @@ -250,4 +252,3 @@ Utilities Loader MLUtils Saveable - diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index bbca216cce493..bd43e91afd280 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -88,20 +88,26 @@ class LogisticRegressionModel(LinearClassificationModel): Classification model trained using Multinomial/Binary Logistic Regression. - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. (Only used in Binary Logistic - Regression. In Multinomial Logistic Regression, the intercepts will - not bea single value, so the intercepts will be part of the - weights.) - :param numFeatures: - The dimension of the features. - :param numClasses: - The number of possible outcomes for k classes classification problem - in Multinomial Logistic Regression. 
By default, it is binary - logistic regression so numClasses will be set to 2. + .. versionadded:: 0.9.0 + Parameters + ---------- + weights : :py:class:`pyspark.mllib.linalg.Vector` + Weights computed for every feature. + intercept : float + Intercept computed for this model. (Only used in Binary Logistic + Regression. In Multinomial Logistic Regression, the intercepts will + not be a single value, so the intercepts will be part of the + weights.) + numFeatures : int + The dimension of the features. + numClasses : int + The number of possible outcomes for k classes classification problem + in Multinomial Logistic Regression. By default, it is binary + logistic regression so numClasses will be set to 2. + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), @@ -159,8 +165,6 @@ class LogisticRegressionModel(LinearClassificationModel): 1 >>> mcm.predict([0.0, 0.0, 0.3]) 2 - - .. versionadded:: 0.9.0 """ def __init__(self, weights, intercept, numFeatures, numClasses): super(LogisticRegressionModel, self).__init__(weights, intercept) @@ -263,54 +267,60 @@ def __repr__(self): class LogisticRegressionWithSGD(object): """ + Train a classification model for Binary Logistic Regression using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.classification.LogisticRegression or - LogisticRegressionWithLBFGS. + .. deprecated:: 2.0.0 + Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS. """ @classmethod - @since('0.9.0') def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.01, regType="l2", intercept=False, validateData=True, convergenceTol=0.001): """ Train a logistic regression model on the given data. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param regType: - The type of regularizer used for training our model. - Supported values: + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + regType : str, optional + The type of regularizer used for training our model. + Supported values: - "l1" for using L1 regularization - "l2" for using L2 regularization (default) - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. 
- (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. Use ml.classification.LogisticRegression or " @@ -326,55 +336,65 @@ def train(rdd, i): class LogisticRegressionWithLBFGS(object): """ + Train a classification model for Multinomial/Binary Logistic Regression + using Limited-memory BFGS. + + Standard feature scaling and L2 regularization are used by default. .. versionadded:: 1.2.0 """ @classmethod - @since('1.2.0') def train(cls, data, iterations=100, initialWeights=None, regParam=0.0, regType="l2", intercept=False, corrections=10, tolerance=1e-6, validateData=True, numClasses=2): """ Train a logistic regression model on the given data. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.0) - :param regType: - The type of regularizer used for training our model. - Supported values: + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + iterations : int, optional + The number of iterations. + (default: 100) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + regType : str, optional + The type of regularizer used for training our model. + Supported values: - "l1" for using L1 regularization - "l2" for using L2 regularization (default) - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param corrections: - The number of corrections used in the LBFGS update. - If a known updater is used for binary classification, - it calls the ml implementation and this parameter will - have no effect. (default: 10) - :param tolerance: - The convergence tolerance of iterations for L-BFGS. - (default: 1e-6) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param numClasses: - The number of classes (i.e., outcomes) a label can take in - Multinomial Logistic Regression. - (default: 2) + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). + (default: False) + corrections : int, optional + The number of corrections used in the LBFGS update. + If a known updater is used for binary classification, + it calls the ml implementation and this parameter will + have no effect. (default: 10) + tolerance : float, optional + The convergence tolerance of iterations for L-BFGS. 
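For context, a compact usage sketch for `LogisticRegressionWithLBFGS.train` as documented above; it assumes a live SparkContext named `sc` (as in the doctests) and the toy data is invented for illustration:

```
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint

# Toy binary training set: label 1.0 when the first feature dominates.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0, 1.0]),
    LabeledPoint(0.0, [0.1, 0.9]),
    LabeledPoint(1.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.9, 0.1]),
])

# Defaults otherwise: L2 regularization, intercept disabled, numClasses=2.
model = LogisticRegressionWithLBFGS.train(data, iterations=10)
print(model.predict([1.0, 0.0]))   # expected: 1
print(model.predict([0.0, 1.0]))   # expected: 0
```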
+ (default: 1e-6) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + numClasses : int, optional + The number of classes (i.e., outcomes) a label can take in + Multinomial Logistic Regression. + (default: 2) + + Examples + -------- >>> data = [ ... LabeledPoint(0.0, [0.0, 1.0]), ... LabeledPoint(1.0, [1.0, 0.0]), @@ -406,11 +426,17 @@ class SVMModel(LinearClassificationModel): """ Model for Support Vector Machines (SVMs). - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. + .. versionadded:: 0.9.0 + + Parameters + ---------- + weights : :py:class:`pyspark.mllib.linalg.Vector` + Weights computed for every feature. + intercept : float + Intercept computed for this model. + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> data = [ ... LabeledPoint(0.0, [0.0]), @@ -451,8 +477,6 @@ class SVMModel(LinearClassificationModel): ... rmtree(path) ... except: ... pass - - .. versionadded:: 0.9.0 """ def __init__(self, weights, intercept): super(SVMModel, self).__init__(weights, intercept) @@ -501,53 +525,59 @@ def load(cls, sc, path): class SVMWithSGD(object): """ + Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 """ @classmethod - @since('0.9.0') def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType="l2", intercept=False, validateData=True, convergenceTol=0.001): """ Train a support vector machine on the given data. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regType: - The type of regularizer used for training our model. - Allowed values: + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regType : str, optional + The type of regularizer used for training our model. + Allowed values: - "l1" for using L1 regularization - "l2" for using L2 regularization (default) - None for no regularization - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. 
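A minimal sketch of `SVMWithSGD.train` with the parameters listed above, again assuming a live SparkContext `sc` and made-up one-dimensional data:

```
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint

data = sc.parallelize([
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(1.0, [1.0]),
    LabeledPoint(1.0, [2.0]),
    LabeledPoint(1.0, [3.0]),
])

# L2-regularized linear SVM trained with mini-batch SGD.
model = SVMWithSGD.train(data, iterations=100, step=1.0, regParam=0.01)
print(model.predict([2.5]))        # hard 0/1 label using the default threshold
model.clearThreshold()
print(model.predict([2.5]))        # raw margin once the threshold is cleared
```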
whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ def train(rdd, i): return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step), @@ -563,14 +593,20 @@ class NaiveBayesModel(Saveable, Loader): """ Model for Naive Bayes classifiers. - :param labels: - List of labels. - :param pi: - Log of class priors, whose dimension is C, number of labels. - :param theta: - Log of class conditional probabilities, whose dimension is C-by-D, - where D is number of features. + .. versionadded:: 0.9.0 + Parameters + ---------- + labels : :py:class:`numpy.ndarray` + List of labels. + pi : :py:class:`numpy.ndarray` + Log of class priors, whose dimension is C, number of labels. + theta : :py:class:`numpy.ndarray` + Log of class conditional probabilities, whose dimension is C-by-D, + where D is number of features. + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> data = [ ... LabeledPoint(0.0, [0.0, 0.0]), @@ -605,8 +641,6 @@ class NaiveBayesModel(Saveable, Loader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 0.9.0 """ def __init__(self, labels, pi, theta): self.labels = labels @@ -652,11 +686,12 @@ def load(cls, sc, path): class NaiveBayes(object): """ + Train a Multinomial Naive Bayes model. + .. versionadded:: 0.9.0 """ @classmethod - @since('0.9.0') def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) @@ -669,11 +704,15 @@ def train(cls, data, lambda_=1.0): it can also be used as `Bernoulli NB `_. The input feature values must be nonnegative. - :param data: - RDD of LabeledPoint. - :param lambda_: - The smoothing parameter. - (default: 1.0) + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of :py:class:`pyspark.mllib.regression.LabeledPoint`. + lambda\\_ : float, optional + The smoothing parameter. + (default: 1.0) """ first = data.first() if not isinstance(first, LabeledPoint): @@ -694,23 +733,25 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm): of features must be constant. An initial weight vector must be provided. - :param stepSize: - Step size for each iteration of gradient descent. - (default: 0.1) - :param numIterations: - Number of iterations run for each batch of data. - (default: 50) - :param miniBatchFraction: - Fraction of each batch of data to use for updates. - (default: 1.0) - :param regParam: - L2 Regularization parameter. - (default: 0.0) - :param convergenceTol: - Value used to determine when to terminate iterations. - (default: 0.001) - .. versionadded:: 1.5.0 + + Parameters + ---------- + stepSize : float, optional + Step size for each iteration of gradient descent. + (default: 0.1) + numIterations : int, optional + Number of iterations run for each batch of data. + (default: 50) + miniBatchFraction : float, optional + Fraction of each batch of data to use for updates. + (default: 1.0) + regParam : float, optional + L2 Regularization parameter. + (default: 0.0) + convergenceTol : float, optional + Value used to determine when to terminate iterations. 
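To illustrate the streaming variant documented above, a sketch that feeds two micro-batches through a queue stream; it assumes a live SparkContext `sc`, and the batch data and timeout are invented for illustration:

```
from pyspark.streaming import StreamingContext
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

ssc = StreamingContext(sc, batchDuration=1)

# Two micro-batches of labeled points delivered as a queue stream.
batches = [
    sc.parallelize([LabeledPoint(0.0, [0.0, 1.0]), LabeledPoint(1.0, [1.0, 0.0])]),
    sc.parallelize([LabeledPoint(0.0, [0.1, 0.9]), LabeledPoint(1.0, [0.9, 0.1])]),
]
stream = ssc.queueStream(batches)

model = StreamingLogisticRegressionWithSGD(stepSize=0.1, numIterations=25)
model.setInitialWeights([0.0, 0.0])   # dimension must match the feature vectors
model.trainOn(stream)                 # weights are updated as each batch arrives

ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=False)
```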
+ (default: 0.001) """ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.0, convergenceTol=0.001): diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index b99a4150c396d..e1a009643c5f2 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -41,6 +41,10 @@ class BisectingKMeansModel(JavaModelWrapper): """ A clustering model derived from the bisecting k-means method. + .. versionadded:: 2.0.0 + + Examples + -------- >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) >>> bskm = BisectingKMeans() >>> model = bskm.train(sc.parallelize(data, 2), k=4) @@ -51,8 +55,6 @@ class BisectingKMeansModel(JavaModelWrapper): 4 >>> model.computeCost(p) 0.0 - - .. versionadded:: 2.0.0 """ def __init__(self, java_model): @@ -72,17 +74,25 @@ def k(self): """Get the number of clusters""" return self.call("k") - @since('2.0.0') def predict(self, x): """ Find the cluster that each of the points belongs to in this model. - :param x: - A data point (or RDD of points) to determine cluster index. - :return: - Predicted cluster index or an RDD of predicted cluster indices - if the input is an RDD. + .. versionadded:: 2.0.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A data point (or RDD of points) to determine cluster index. + :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent + objects (list, tuple, numpy.ndarray). + + Returns + ------- + int or :py:class:`pyspark.RDD` of int + Predicted cluster index or an RDD of predicted cluster indices + if the input is an RDD. """ if isinstance(x, RDD): vecs = x.map(_convert_to_vector) @@ -91,15 +101,20 @@ def predict(self, x): x = _convert_to_vector(x) return self.call("predict", x) - @since('2.0.0') def computeCost(self, x): """ Return the Bisecting K-means cost (sum of squared distances of points to their nearest center) for this model on the given data. If provided with an RDD of points returns the sum. - :param point: - A data point (or RDD of points) to compute the cost(s). + .. versionadded:: 2.0.0 + + Parameters + ---------- + point : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A data point (or RDD of points) to compute the cost(s). + :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent + objects (list, tuple, numpy.ndarray). """ if isinstance(x, RDD): vecs = x.map(_convert_to_vector) @@ -122,37 +137,43 @@ class BisectingKMeans(object): clusters on the bottom level would result more than `k` leaf clusters, larger clusters get higher priority. - Based on - `Steinbach, Karypis, and Kumar, A comparison of document clustering - techniques, KDD Workshop on Text Mining, 2000 - `_. - .. versionadded:: 2.0.0 + + Notes + ----- + See the original paper [1]_ + + .. [1] Steinbach, M. et al. “A Comparison of Document Clustering Techniques.” (2000). + KDD Workshop on Text Mining, 2000 + http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf """ @classmethod - @since('2.0.0') def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604): """ Runs the bisecting k-means algorithm return the model. - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - The desired number of leaf clusters. The actual number could - be smaller if there are no divisible leaf clusters. 
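A brief end-to-end sketch of `BisectingKMeans.train` with the parameters described above (live SparkContext `sc` assumed, toy points invented):

```
from numpy import array
from pyspark.mllib.clustering import BisectingKMeans

points = sc.parallelize([
    array([0.0, 0.0]), array([1.0, 1.0]),
    array([9.0, 8.0]), array([8.0, 9.0]),
])

# Ask for at most 2 leaf clusters; fewer are returned if none are divisible.
model = BisectingKMeans.train(points, k=2, maxIterations=5)
print(model.clusterCenters)
print(model.predict([0.5, 0.5]))       # index of the nearest leaf cluster
print(model.computeCost(points))       # sum of squared distances to centers
```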
- (default: 4) - :param maxIterations: - Maximum number of iterations allowed to split clusters. - (default: 20) - :param minDivisibleClusterSize: - Minimum number of points (if >= 1.0) or the minimum proportion - of points (if < 1.0) of a divisible cluster. - (default: 1) - :param seed: - Random seed value for cluster initialization. - (default: -1888008604 from classOf[BisectingKMeans].getName.##) + .. versionadded:: 2.0.0 + + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + Training points as an `RDD` of `Vector` or convertible + sequence types. + k : int, optional + The desired number of leaf clusters. The actual number could + be smaller if there are no divisible leaf clusters. + (default: 4) + maxIterations : int, optional + Maximum number of iterations allowed to split clusters. + (default: 20) + minDivisibleClusterSize : float, optional + Minimum number of points (if >= 1.0) or the minimum proportion + of points (if < 1.0) of a divisible cluster. + (default: 1) + seed : int, optional + Random seed value for cluster initialization. + (default: -1888008604 from classOf[BisectingKMeans].getName.##) """ java_model = callMLlibFunc( "trainBisectingKMeans", rdd.map(_convert_to_vector), @@ -165,6 +186,10 @@ class KMeansModel(Saveable, Loader): """A clustering model derived from the k-means method. + .. versionadded:: 0.9.0 + + Examples + -------- >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2) >>> model = KMeans.train( ... sc.parallelize(data), 2, maxIterations=10, initializationMode="random", @@ -213,8 +238,6 @@ class KMeansModel(Saveable, Loader): ... initialModel = KMeansModel([(-1000.0,-1000.0),(5.0,5.0),(1000.0,1000.0)])) >>> model.clusterCenters [array([-1000., -1000.]), array([ 5., 5.]), array([ 1000., 1000.])] - - .. versionadded:: 0.9.0 """ def __init__(self, centers): @@ -232,17 +255,25 @@ def k(self): """Total number of clusters.""" return len(self.centers) - @since('0.9.0') def predict(self, x): """ Find the cluster that each of the points belongs to in this model. - :param x: - A data point (or RDD of points) to determine cluster index. - :return: - Predicted cluster index or an RDD of predicted cluster indices - if the input is an RDD. + .. versionadded:: 0.9.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A data point (or RDD of points) to determine cluster index. + :py:class:`pyspark.mllib.linalg.Vector` can be replaced with equivalent + objects (list, tuple, numpy.ndarray). + + Returns + ------- + int or :py:class:`pyspark.RDD` of int + Predicted cluster index or an RDD of predicted cluster indices + if the input is an RDD. """ best = 0 best_distance = float("inf") @@ -257,15 +288,18 @@ def predict(self, x): best_distance = distance return best - @since('1.4.0') def computeCost(self, rdd): """ Return the K-means cost (sum of squared distances of points to their nearest center) for this model on the given data. - :param rdd: - The RDD of points to compute the cost on. + .. versionadded:: 1.4.0 + + Parameters + ---------- + rdd : ::py:class:`pyspark.RDD` + The RDD of points to compute the cost on. """ cost = callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector), [_convert_to_vector(c) for c in self.centers]) @@ -292,46 +326,51 @@ def load(cls, sc, path): class KMeans(object): """ + K-means clustering. + .. 
versionadded:: 0.9.0 """ @classmethod - @since('0.9.0') def train(cls, rdd, k, maxIterations=100, initializationMode="k-means||", seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None): """ Train a k-means clustering model. - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - Number of clusters to create. - :param maxIterations: - Maximum number of iterations allowed. - (default: 100) - :param initializationMode: - The initialization algorithm. This can be either "random" or - "k-means||". - (default: "k-means||") - :param seed: - Random seed value for cluster initialization. Set as None to - generate seed based on system time. - (default: None) - :param initializationSteps: - Number of steps for the k-means|| initialization mode. - This is an advanced setting -- the default of 2 is almost - always enough. - (default: 2) - :param epsilon: - Distance threshold within which a center will be considered to - have converged. If all centers move less than this Euclidean - distance, iterations are stopped. - (default: 1e-4) - :param initialModel: - Initial cluster centers can be provided as a KMeansModel object - rather than using the random or k-means|| initializationModel. - (default: None) + .. versionadded:: 0.9.0 + + Parameters + ---------- + rdd : ::py:class:`pyspark.RDD` + Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector` + or convertible sequence types. + k : int + Number of clusters to create. + maxIterations : int, optional + Maximum number of iterations allowed. + (default: 100) + initializationMode : str, optional + The initialization algorithm. This can be either "random" or + "k-means||". + (default: "k-means||") + seed : int, optional + Random seed value for cluster initialization. Set as None to + generate seed based on system time. + (default: None) + initializationSteps : + Number of steps for the k-means|| initialization mode. + This is an advanced setting -- the default of 2 is almost + always enough. + (default: 2) + epsilon : float, optional + Distance threshold within which a center will be considered to + have converged. If all centers move less than this Euclidean + distance, iterations are stopped. + (default: 1e-4) + initialModel : :py:class:`KMeansModel`, optional + Initial cluster centers can be provided as a KMeansModel object + rather than using the random or k-means|| initializationModel. + (default: None) """ clusterInitialModel = [] if initialModel is not None: @@ -352,6 +391,10 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ A clustering model derived from the Gaussian Mixture Model method. + .. versionadded:: 1.3.0 + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors, DenseMatrix >>> from numpy.testing import assert_equal >>> from shutil import rmtree @@ -410,8 +453,6 @@ class GaussianMixtureModel(JavaModelWrapper, JavaSaveable, JavaLoader): True >>> labels[2]==labels[3]==labels[4] True - - .. versionadded:: 1.3.0 """ @property @@ -440,17 +481,23 @@ def k(self): """Number of gaussians in mixture.""" return len(self.weights) - @since('1.3.0') def predict(self, x): """ Find the cluster to which the point 'x' or each point in RDD 'x' has maximum membership in this model. - :param x: - A feature vector or an RDD of vectors representing data points. - :return: - Predicted cluster label or an RDD of predicted cluster labels - if the input is an RDD. + .. 
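As a compact companion to the `KMeans.train` parameters documented above, a usage sketch (live SparkContext `sc` assumed, data invented):

```
from numpy import array
from pyspark.mllib.clustering import KMeans

points = sc.parallelize([
    array([0.0, 0.0]), array([1.0, 1.0]),
    array([9.0, 8.0]), array([8.0, 9.0]),
])

model = KMeans.train(points, k=2, maxIterations=10,
                     initializationMode="k-means||", seed=50, epsilon=1e-4)
print(model.clusterCenters)
print(model.predict(array([0.5, 0.5])))   # 0 or 1, depending on center order
print(model.computeCost(points))          # within-cluster sum of squared errors
```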
versionadded:: 1.3.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A feature vector or an RDD of vectors representing data points. + + Returns + ------- + numpy.float64 or :py:class:`pyspark.RDD` of int + Predicted cluster label or an RDD of predicted cluster labels + if the input is an RDD. """ if isinstance(x, RDD): cluster_labels = self.predictSoft(x).map(lambda z: z.index(max(z))) @@ -459,16 +506,22 @@ def predict(self, x): z = self.predictSoft(x) return z.argmax() - @since('1.3.0') def predictSoft(self, x): """ Find the membership of point 'x' or each point in RDD 'x' to all mixture components. - :param x: - A feature vector or an RDD of vectors representing data points. - :return: - The membership value to all mixture components for vector 'x' - or each vector in RDD 'x'. + .. versionadded:: 1.3.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + A feature vector or an RDD of vectors representing data points. + + Returns + ------- + numpy.ndarray or :py:class:`pyspark.RDD` + The membership value to all mixture components for vector 'x' + or each vector in RDD 'x'. """ if isinstance(x, RDD): means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians]) @@ -479,14 +532,16 @@ def predictSoft(self, x): return self.call("predictSoft", _convert_to_vector(x)).toArray() @classmethod - @since('1.5.0') def load(cls, sc, path): """Load the GaussianMixtureModel from disk. - :param sc: - SparkContext. - :param path: - Path to where the model is stored. + .. versionadded:: 1.5.0 + + Parameters + ---------- + sc : :py:class:`SparkContext` + path : str + Path to where the model is stored. """ model = cls._load_java(sc, path) wrapper = sc._jvm.org.apache.spark.mllib.api.python.GaussianMixtureModelWrapper(model) @@ -499,32 +554,36 @@ class GaussianMixture(object): .. versionadded:: 1.3.0 """ + @classmethod - @since('1.3.0') def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None): """ Train a Gaussian Mixture clustering model. - :param rdd: - Training points as an `RDD` of `Vector` or convertible - sequence types. - :param k: - Number of independent Gaussians in the mixture model. - :param convergenceTol: - Maximum change in log-likelihood at which convergence is - considered to have occurred. - (default: 1e-3) - :param maxIterations: - Maximum number of iterations allowed. - (default: 100) - :param seed: - Random seed for initial Gaussian distribution. Set as None to - generate seed based on system time. - (default: None) - :param initialModel: - Initial GMM starting point, bypassing the random - initialization. - (default: None) + .. versionadded:: 1.3.0 + + Parameters + ---------- + rdd : ::py:class:`pyspark.RDD` + Training points as an `RDD` of :py:class:`pyspark.mllib.linalg.Vector` + or convertible sequence types. + k : int + Number of independent Gaussians in the mixture model. + convergenceTol : float, optional + Maximum change in log-likelihood at which convergence is + considered to have occurred. + (default: 1e-3) + maxIterations : int, optional + Maximum number of iterations allowed. + (default: 100) + seed : int, optional + Random seed for initial Gaussian distribution. Set as None to + generate seed based on system time. + (default: None) + initialModel : GaussianMixtureModel, optional + Initial GMM starting point, bypassing the random + initialization. 
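A sketch of `GaussianMixture.train` with the parameters listed above, showing both hard and soft assignments (live SparkContext `sc` assumed, toy data invented):

```
from numpy import array
from pyspark.mllib.clustering import GaussianMixture

data = sc.parallelize([
    array([-0.10, -0.05]), array([-0.01, -0.10]),
    array([0.90, 0.80]), array([0.75, 0.935]),
])

# Fit a 2-component GMM; the seed is fixed so the toy run is repeatable.
model = GaussianMixture.train(data, k=2, convergenceTol=1e-3,
                              maxIterations=100, seed=10)
print(model.weights)                      # mixing weights, sum to 1.0
print(model.predict(data).collect())      # hard cluster labels
print(model.predictSoft(data).collect())  # per-component membership probabilities
```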
+ (default: None) """ initialModelWeights = None initialModelMu = None @@ -545,8 +604,12 @@ def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initia class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ - Model produced by [[PowerIterationClustering]]. + Model produced by :py:class:`PowerIterationClustering`. + .. versionadded:: 1.5.0 + + Examples + -------- >>> import math >>> def genCircle(r, n): ... points = [] @@ -589,8 +652,6 @@ class PowerIterationClusteringModel(JavaModelWrapper, JavaSaveable, JavaLoader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 1.5.0 """ @property @@ -623,37 +684,48 @@ def load(cls, sc, path): class PowerIterationClustering(object): """ - Power Iteration Clustering (PIC), a scalable graph clustering algorithm - developed by [[http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf Lin and Cohen]]. - From the abstract: PIC finds a very low-dimensional embedding of a - dataset using truncated power iteration on a normalized pair-wise - similarity matrix of the data. + Power Iteration Clustering (PIC), a scalable graph clustering algorithm. + + + Developed by Lin and Cohen [1]_. From the abstract: + + "PIC finds a very low-dimensional embedding of a + dataset using truncated power iteration on a normalized pair-wise + similarity matrix of the data." .. versionadded:: 1.5.0 + + .. [1] Lin, Frank & Cohen, William. (2010). Power Iteration Clustering. + http://www.cs.cmu.edu/~frank/papers/icml2010-pic-final.pdf """ @classmethod - @since('1.5.0') def train(cls, rdd, k, maxIterations=100, initMode="random"): r""" - :param rdd: - An RDD of (i, j, s\ :sub:`ij`\) tuples representing the - affinity matrix, which is the matrix A in the PIC paper. The - similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric - matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ For any (i, j) with - nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or - (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored, - because it is assumed s\ :sub:`ij`\ = 0.0. - :param k: - Number of clusters. - :param maxIterations: - Maximum number of iterations of the PIC algorithm. - (default: 100) - :param initMode: - Initialization mode. This can be either "random" to use - a random vector as vertex properties, or "degree" to use - normalized sum similarities. - (default: "random") + Train PowerIterationClusteringModel + + .. versionadded:: 1.5.0 + + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + An RDD of (i, j, s\ :sub:`ij`\) tuples representing the + affinity matrix, which is the matrix A in the PIC paper. The + similarity s\ :sub:`ij`\ must be nonnegative. This is a symmetric + matrix and hence s\ :sub:`ij`\ = s\ :sub:`ji`\ For any (i, j) with + nonzero similarity, there should be either (i, j, s\ :sub:`ij`\) or + (j, i, s\ :sub:`ji`\) in the input. Tuples with i = j are ignored, + because it is assumed s\ :sub:`ij`\ = 0.0. + k : int + Number of clusters. + maxIterations : int, optional + Maximum number of iterations of the PIC algorithm. + (default: 100) + initMode : str, optional + Initialization mode. This can be either "random" to use + a random vector as vertex properties, or "degree" to use + normalized sum similarities. 
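A small sketch of the affinity-matrix input format and `PowerIterationClustering.train` described above (live SparkContext `sc` assumed; the graph below is invented):

```
from pyspark.mllib.clustering import PowerIterationClustering

# Symmetric affinity graph given as (i, j, similarity) tuples; only one
# direction per pair is required, and tuples with i == j are ignored.
similarities = sc.parallelize([
    (0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0),   # a tight triangle
    (3, 4, 1.0),                             # a separate pair
    (0, 3, 0.01),                            # weak bridge between the two groups
])

model = PowerIterationClustering.train(similarities, k=2, maxIterations=20,
                                       initMode="random")
for a in model.assignments().collect():
    print(a.id, a.cluster)
```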
+ (default: "random") """ model = callMLlibFunc("trainPowerIterationClusteringModel", rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode) @@ -673,29 +745,37 @@ class StreamingKMeansModel(KMeansModel): The update formula for each centroid is given by - * c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t) - * n_t+1 = n_t * a + m_t + - c_t+1 = ((c_t * n_t * a) + (x_t * m_t)) / (n_t + m_t) + - n_t+1 = n_t * a + m_t where - * c_t: Centroid at the n_th iteration. - * n_t: Number of samples (or) weights associated with the centroid - at the n_th iteration. - * x_t: Centroid of the new data closest to c_t. - * m_t: Number of samples (or) weights of the new data closest to c_t - * c_t+1: New centroid. - * n_t+1: New number of weights. - * a: Decay Factor, which gives the forgetfulness. + - c_t: Centroid at the n_th iteration. + - n_t: Number of samples (or) weights associated with the centroid + at the n_th iteration. + - x_t: Centroid of the new data closest to c_t. + - m_t: Number of samples (or) weights of the new data closest to c_t + - c_t+1: New centroid. + - n_t+1: New number of weights. + - a: Decay Factor, which gives the forgetfulness. - .. note:: If a is set to 1, it is the weighted mean of the previous - and new data. If it set to zero, the old centroids are completely - forgotten. - - :param clusterCenters: - Initial cluster centers. - :param clusterWeights: - List of weights assigned to each cluster. + .. versionadded:: 1.5.0 + Parameters + ---------- + clusterCenters : list of :py:class:`pyspark.mllib.linalg.Vector` or covertible + Initial cluster centers. + clusterWeights : :py:class:`pyspark.mllib.linalg.Vector` or covertible + List of weights assigned to each cluster. + + Notes + ----- + If a is set to 1, it is the weighted mean of the previous + and new data. If it set to zero, the old centroids are completely + forgotten. + + Examples + -------- >>> initCenters = [[0.0, 0.0], [1.0, 1.0]] >>> initWeights = [1.0, 1.0] >>> stkm = StreamingKMeansModel(initCenters, initWeights) @@ -723,8 +803,6 @@ class StreamingKMeansModel(KMeansModel): 0 >>> stkm.predict([1.5, 1.5]) 1 - - .. versionadded:: 1.5.0 """ def __init__(self, clusterCenters, clusterWeights): super(StreamingKMeansModel, self).__init__(centers=clusterCenters) @@ -740,14 +818,18 @@ def clusterWeights(self): def update(self, data, decayFactor, timeUnit): """Update the centroids, according to data - :param data: - RDD with new data for the model update. - :param decayFactor: - Forgetfulness of the previous centroids. - :param timeUnit: - Can be "batches" or "points". If points, then the decay factor - is raised to the power of number of new points and if batches, - then decay factor will be used as is. + .. versionadded:: 1.5.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + RDD with new data for the model update. + decayFactor : float + Forgetfulness of the previous centroids. + timeUnit : str + Can be "batches" or "points". If points, then the decay factor + is raised to the power of number of new points and if batches, + then decay factor will be used as is. """ if not isinstance(data, RDD): raise TypeError("Data should be of an RDD, got %s." % type(data)) @@ -772,19 +854,21 @@ class StreamingKMeans(object): More details on how the centroids are updated are provided under the docs of StreamingKMeansModel. - :param k: - Number of clusters. - (default: 2) - :param decayFactor: - Forgetfulness of the previous centroids. - (default: 1.0) - :param timeUnit: - Can be "batches" or "points". 
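To complement the decay-factor discussion above, a streaming k-means sketch driven by a queue stream (live SparkContext `sc` assumed; batches and timeout invented for illustration):

```
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import StreamingKMeans

ssc = StreamingContext(sc, batchDuration=1)

# Two micro-batches of 2-dimensional points.
batches = [
    sc.parallelize([[0.0, 0.0], [0.1, 0.1], [1.0, 1.0]]),
    sc.parallelize([[0.9, 0.9], [0.2, 0.1], [1.1, 1.0]]),
]
stream = ssc.queueStream(batches)

# decayFactor=1.0 weights all batches equally; 0.0 would forget old centroids.
skm = StreamingKMeans(k=2, decayFactor=1.0, timeUnit="batches")
skm.setInitialCenters(centers=[[0.0, 0.0], [1.0, 1.0]], weights=[1.0, 1.0])
skm.trainOn(stream)

ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=False)
print(skm.latestModel().clusterCenters)
```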
If points, then the decay factor is - raised to the power of number of new points and if batches, then - decay factor will be used as is. - (default: "batches") - .. versionadded:: 1.5.0 + + Parameters + ---------- + k : int, optional + Number of clusters. + (default: 2) + decayFactor : float, optional + Forgetfulness of the previous centroids. + (default: 1.0) + timeUnit : str, optional + Can be "batches" or "points". If points, then the decay factor is + raised to the power of number of new points and if batches, then + decay factor will be used as is. + (default: "batches") """ def __init__(self, k=2, decayFactor=1.0, timeUnit="batches"): self._k = k @@ -887,13 +971,23 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader): Latent Dirichlet Allocation (LDA), a topic model designed for text documents. Terminology + - "word" = "term": an element of the vocabulary - "token": instance of a term appearing in a document - "topic": multinomial distribution over words representing some concept - References: - - Original LDA paper (journal version): - Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003. + .. versionadded:: 1.5.0 + + Notes + ----- + See the original LDA paper (journal version) [1]_ + + .. [1] Blei, D. et al. "Latent Dirichlet Allocation." + J. Mach. Learn. Res. 3 (2003): 993-1022. + https://www.jmlr.org/papers/v3/blei03a + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors >>> from numpy.testing import assert_almost_equal, assert_equal >>> data = [ @@ -925,8 +1019,6 @@ class LDAModel(JavaModelWrapper, JavaSaveable, Loader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 1.5.0 """ @since('1.5.0') @@ -939,19 +1031,24 @@ def vocabSize(self): """Vocabulary size (number of terms or terms in the vocabulary)""" return self.call("vocabSize") - @since('1.6.0') def describeTopics(self, maxTermsPerTopic=None): """Return the topics described by weighted terms. - WARNING: If vocabSize and k are large, this can return a large object! - - :param maxTermsPerTopic: - Maximum number of terms to collect for each topic. - (default: vocabulary size) - :return: - Array over topics. Each topic is represented as a pair of - matching arrays: (term indices, term weights in topic). - Each topic's terms are sorted in order of decreasing weight. + .. versionadded:: 1.6.0 + .. warning:: If vocabSize and k are large, this can return a large object! + + Parameters + ---------- + maxTermsPerTopic : int, optional + Maximum number of terms to collect for each topic. + (default: vocabulary size) + + Returns + ------- + list + Array over topics. Each topic is represented as a pair of + matching arrays: (term indices, term weights in topic). + Each topic's terms are sorted in order of decreasing weight. """ if maxTermsPerTopic is None: topics = self.call("describeTopics") @@ -960,14 +1057,16 @@ def describeTopics(self, maxTermsPerTopic=None): return topics @classmethod - @since('1.5.0') def load(cls, sc, path): """Load the LDAModel from disk. - :param sc: - SparkContext. - :param path: - Path to where the model is stored. + .. versionadded:: 1.5.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + path : str + Path to where the model is stored. """ if not isinstance(sc, SparkContext): raise TypeError("sc should be a SparkContext, got type %s" % type(sc)) @@ -979,47 +1078,52 @@ def load(cls, sc, path): class LDA(object): """ + Train Latent Dirichlet Allocation (LDA) model. + .. 
versionadded:: 1.5.0 """ @classmethod - @since('1.5.0') def train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0, topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer="em"): """Train a LDA model. - :param rdd: - RDD of documents, which are tuples of document IDs and term - (word) count vectors. The term count vectors are "bags of - words" with a fixed-size vocabulary (where the vocabulary size - is the length of the vector). Document IDs must be unique - and >= 0. - :param k: - Number of topics to infer, i.e., the number of soft cluster - centers. - (default: 10) - :param maxIterations: - Maximum number of iterations allowed. - (default: 20) - :param docConcentration: - Concentration parameter (commonly named "alpha") for the prior - placed on documents' distributions over topics ("theta"). - (default: -1.0) - :param topicConcentration: - Concentration parameter (commonly named "beta" or "eta") for - the prior placed on topics' distributions over terms. - (default: -1.0) - :param seed: - Random seed for cluster initialization. Set as None to generate - seed based on system time. - (default: None) - :param checkpointInterval: - Period (in iterations) between checkpoints. - (default: 10) - :param optimizer: - LDAOptimizer used to perform the actual calculation. Currently - "em", "online" are supported. - (default: "em") + .. versionadded:: 1.5.0 + + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + RDD of documents, which are tuples of document IDs and term + (word) count vectors. The term count vectors are "bags of + words" with a fixed-size vocabulary (where the vocabulary size + is the length of the vector). Document IDs must be unique + and >= 0. + k : int, optional + Number of topics to infer, i.e., the number of soft cluster + centers. + (default: 10) + maxIterations : int, optional + Maximum number of iterations allowed. + (default: 20) + docConcentration : float, optional + Concentration parameter (commonly named "alpha") for the prior + placed on documents' distributions over topics ("theta"). + (default: -1.0) + topicConcentration : float, optional + Concentration parameter (commonly named "beta" or "eta") for + the prior placed on topics' distributions over terms. + (default: -1.0) + seed : int, optional + Random seed for cluster initialization. Set as None to generate + seed based on system time. + (default: None) + checkpointInterval : int, optional + Period (in iterations) between checkpoints. + (default: 10) + optimizer : str, optional + LDAOptimizer used to perform the actual calculation. Currently + "em", "online" are supported. + (default: "em") """ model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations, docConcentration, topicConcentration, seed, diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index f3be827fb6e4f..198a9791774a9 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -30,8 +30,15 @@ class BinaryClassificationMetrics(JavaModelWrapper): """ Evaluator for binary classification. - :param scoreAndLabels: an RDD of score, label and optional weight. + .. versionadded:: 1.4.0 + + Parameters + ---------- + scoreAndLabels : :py:class:`pyspark.RDD` + an RDD of score, label and optional weight. + Examples + -------- >>> scoreAndLabels = sc.parallelize([ ... 
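For the `LDA.train` parameters documented above, a minimal corpus-to-topics sketch (live SparkContext `sc` assumed; the 3-word corpus is invented):

```
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA

# Corpus as (document id, term-count vector) pairs over a 3-term vocabulary.
corpus = sc.parallelize([
    (0, Vectors.dense([1.0, 0.0, 3.0])),
    (1, Vectors.dense([0.0, 2.0, 1.0])),
    (2, Vectors.dense([4.0, 0.0, 0.0])),
])

model = LDA.train(corpus, k=2, maxIterations=20, seed=1, optimizer="em")
print(model.vocabSize())                   # 3
for termIndices, termWeights in model.describeTopics(maxTermsPerTopic=2):
    print(termIndices, termWeights)        # top terms per topic, weight-sorted
```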
(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2) >>> metrics = BinaryClassificationMetrics(scoreAndLabels) @@ -48,8 +55,6 @@ class BinaryClassificationMetrics(JavaModelWrapper): 0.79... >>> metrics.areaUnderPR 0.88... - - .. versionadded:: 1.4.0 """ def __init__(self, scoreAndLabels): @@ -95,8 +100,15 @@ class RegressionMetrics(JavaModelWrapper): """ Evaluator for regression. - :param predictionAndObservations: an RDD of prediction, observation and optional weight. + .. versionadded:: 1.4.0 + + Parameters + ---------- + predictionAndObservations : :py:class:`pyspark.RDD` + an RDD of prediction, observation and optional weight. + Examples + -------- >>> predictionAndObservations = sc.parallelize([ ... (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)]) >>> metrics = RegressionMetrics(predictionAndObservations) @@ -115,8 +127,6 @@ class RegressionMetrics(JavaModelWrapper): >>> metrics = RegressionMetrics(predictionAndObservationsWithOptWeight) >>> metrics.rootMeanSquaredError 0.68... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndObservations): @@ -182,9 +192,15 @@ class MulticlassMetrics(JavaModelWrapper): """ Evaluator for multiclass classification. - :param predictionAndLabels: an RDD of prediction, label, optional weight - and optional probability. + .. versionadded:: 1.4.0 + + Parameters + ---------- + predictionAndLabels : :py:class:`pyspark.RDD` + an RDD of prediction, label, optional weight and optional probability. + Examples + -------- >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) >>> metrics = MulticlassMetrics(predictionAndLabels) @@ -246,8 +262,6 @@ class MulticlassMetrics(JavaModelWrapper): >>> metrics = MulticlassMetrics(predictionAndLabelsWithProbabilities) >>> metrics.logLoss() 0.9682... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndLabels): @@ -377,9 +391,15 @@ class RankingMetrics(JavaModelWrapper): """ Evaluator for ranking algorithms. - :param predictionAndLabels: an RDD of (predicted ranking, - ground truth set) pairs. + .. versionadded:: 1.4.0 + Parameters + ---------- + predictionAndLabels : :py:class:`pyspark.RDD` + an RDD of (predicted ranking, ground truth set) pairs. + + Examples + -------- >>> predictionAndLabels = sc.parallelize([ ... ([1, 6, 2, 7, 8, 3, 9, 10, 4, 5], [1, 2, 3, 4, 5]), ... ([4, 1, 5, 6, 2, 7, 3, 8, 9, 10], [1, 2, 3]), @@ -407,8 +427,6 @@ class RankingMetrics(JavaModelWrapper): 0.35... >>> metrics.recallAt(15) 0.66... - - .. versionadded:: 1.4.0 """ def __init__(self, predictionAndLabels): @@ -484,10 +502,16 @@ class MultilabelMetrics(JavaModelWrapper): """ Evaluator for multilabel classification. - :param predictionAndLabels: an RDD of (predictions, labels) pairs, - both are non-null Arrays, each with - unique elements. + .. versionadded:: 1.4.0 + + Parameters + ---------- + predictionAndLabels : :py:class:`pyspark.RDD` + an RDD of (predictions, labels) pairs, + both are non-null Arrays, each with unique elements. + Examples + -------- >>> predictionAndLabels = sc.parallelize([([0.0, 1.0], [0.0, 2.0]), ([0.0, 2.0], [0.0, 1.0]), ... ([], [0.0]), ([2.0], [2.0]), ([2.0, 0.0], [2.0, 0.0]), ... ([0.0, 1.0, 2.0], [0.0, 1.0]), ([1.0], [1.0, 2.0])]) @@ -516,8 +540,6 @@ class MultilabelMetrics(JavaModelWrapper): 0.28... >>> metrics.accuracy 0.54... - - .. 
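As a compact companion to the evaluators above, a per-label breakdown with `MulticlassMetrics` (live SparkContext `sc` assumed; the prediction/label pairs are invented):

```
from pyspark.mllib.evaluation import MulticlassMetrics

# (prediction, label) pairs produced by some classifier.
predictionAndLabels = sc.parallelize([
    (0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (1.0, 1.0),
    (2.0, 2.0), (2.0, 0.0), (1.0, 0.0), (2.0, 2.0),
])

metrics = MulticlassMetrics(predictionAndLabels)
print(metrics.accuracy)                     # overall accuracy
print(metrics.precision(1.0))               # precision for label 1.0
print(metrics.recall(1.0))                  # recall for label 1.0
print(metrics.fMeasure(1.0, beta=1.0))      # F1 for label 1.0
print(metrics.confusionMatrix().toArray())  # rows: true labels, columns: predictions
```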
versionadded:: 1.4.0 """ def __init__(self, predictionAndLabels): diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index d95f9197eaedf..1d37ab815655b 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -41,7 +41,10 @@ def transform(self, vector): """ Applies transformation on a vector. - :param vector: vector to be transformed. + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + vector or convertible or RDD to be transformed. """ raise NotImplementedError @@ -56,8 +59,15 @@ class Normalizer(VectorTransformer): For `p` = float('inf'), max(abs(vector)) will be used as norm for normalization. - :param p: Normalization in L^p^ space, p = 2 by default. + .. versionadded:: 1.2.0 + + Parameters + ---------- + p : float, optional + Normalization in L^p^ space, p = 2 by default. + Examples + -------- >>> from pyspark.mllib.linalg import Vectors >>> v = Vectors.dense(range(3)) >>> nor = Normalizer(1) @@ -71,21 +81,27 @@ class Normalizer(VectorTransformer): >>> nor2 = Normalizer(float("inf")) >>> nor2.transform(v) DenseVector([0.0, 0.5, 1.0]) - - .. versionadded:: 1.2.0 """ def __init__(self, p=2.0): assert p >= 1.0, "p should be greater than 1.0" self.p = float(p) - @since('1.2.0') def transform(self, vector): """ Applies unit length normalization on a vector. - :param vector: vector or RDD of vector to be normalized. - :return: normalized vector. If the norm of the input is zero, it - will return the input vector. + .. versionadded:: 1.2.0 + + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + vector or RDD of vector to be normalized. + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + normalized vector(s). If the norm of the input is zero, it + will return the input vector. """ if isinstance(vector, RDD): vector = vector.map(_convert_to_vector) @@ -103,11 +119,16 @@ def transform(self, vector): """ Applies transformation on a vector or an RDD[Vector]. - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Input vector(s) to be transformed. - :param vector: Vector or RDD of Vector to be transformed. + Notes + ----- + In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. """ if isinstance(vector, RDD): vector = vector.map(_convert_to_vector) @@ -123,19 +144,29 @@ class StandardScalerModel(JavaVectorTransformer): .. versionadded:: 1.2.0 """ - @since('1.2.0') def transform(self, vector): """ Applies standardization transformation on a vector. - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. versionadded:: 1.2.0 + + Parameters + ---------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Input vector(s) to be standardized. - :param vector: Vector or RDD of Vector to be standardized. - :return: Standardized vector. If the variance of a column is - zero, it will return default `0.0` for the column with - zero variance. + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Standardized vector(s). 
If the variance of a column is + zero, it will return default `0.0` for the column with + zero variance. + + Notes + ----- + In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. """ return JavaVectorTransformer.transform(self, vector) @@ -196,12 +227,20 @@ class StandardScaler(object): variance using column summary statistics on the samples in the training set. - :param withMean: False by default. Centers the data with mean - before scaling. It will build a dense output, so take - care when applying to sparse input. - :param withStd: True by default. Scales the data to unit - standard deviation. + .. versionadded:: 1.2.0 + Parameters + ---------- + withMean : bool, optional + False by default. Centers the data with mean + before scaling. It will build a dense output, so take + care when applying to sparse input. + withStd : bool, optional + True by default. Scales the data to unit + standard deviation. + + Examples + -------- >>> vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])] >>> dataset = sc.parallelize(vs) >>> standardizer = StandardScaler(True, True) @@ -218,8 +257,6 @@ class StandardScaler(object): True >>> model.withMean True - - .. versionadded:: 1.2.0 """ def __init__(self, withMean=False, withStd=True): if not (withMean or withStd): @@ -227,15 +264,22 @@ def __init__(self, withMean=False, withStd=True): self.withMean = withMean self.withStd = withStd - @since('1.2.0') def fit(self, dataset): """ Computes the mean and variance and stores as a model to be used for later scaling. - :param dataset: The data used to compute the mean and variance - to build the transformation model. - :return: a StandardScalarModel + .. versionadded:: 1.2.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.RDD` + The data used to compute the mean and variance + to build the transformation model. + + Returns + ------- + :py:class:`StandardScalerModel` """ dataset = dataset.map(_convert_to_vector) jmodel = callMLlibFunc("fitStandardScaler", self.withMean, self.withStd, dataset) @@ -249,13 +293,21 @@ class ChiSqSelectorModel(JavaVectorTransformer): .. versionadded:: 1.4.0 """ - @since('1.4.0') def transform(self, vector): """ Applies transformation on a vector. - :param vector: Vector or RDD of Vector to be transformed. - :return: transformed vector. + .. versionadded:: 1.4.0 + + Examples + -------- + vector : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Input vector(s) to be transformed. + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + transformed vector(s). """ return JavaVectorTransformer.transform(self, vector) @@ -284,6 +336,10 @@ class ChiSqSelector(object): By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. + .. versionadded:: 1.4.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector, DenseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = sc.parallelize([ @@ -306,8 +362,6 @@ class ChiSqSelector(object): >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data) >>> model.transform(DenseVector([7.0, 9.0, 5.0])) DenseVector([7.0]) - - .. 
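A short fit/transform sketch for `StandardScaler` as documented above (live SparkContext `sc` assumed; vectors invented), which also illustrates the note about calling `transform` directly on the RDD:

```
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

vs = sc.parallelize([
    Vectors.dense([-2.0, 2.3, 0.0]),
    Vectors.dense([3.8, 0.0, 1.9]),
    Vectors.dense([1.2, -1.1, 2.5]),
])

# withMean centers each column (densifying sparse input); withStd scales to unit variance.
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(vs)          # computes column means and variances
scaled = model.transform(vs)    # call transform on the RDD, not inside an RDD operation
print(scaled.collect())
```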
versionadded:: 1.4.0 """ def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05): @@ -372,15 +426,18 @@ def setSelectorType(self, selectorType): self.selectorType = str(selectorType) return self - @since('1.4.0') def fit(self, data): """ Returns a ChiSquared feature selector. - :param data: an `RDD[LabeledPoint]` containing the labeled dataset - with categorical features. Real-valued features will be - treated as categorical for each distinct value. - Apply feature discretizer before using this function. + .. versionadded:: 1.4.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` of :py:class:`pyspark.mllib.regression.LabeledPoint` + containing the labeled dataset with categorical features. + Real-valued features will be treated as categorical for each + distinct value. Apply feature discretizer before using this function. """ jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures, self.percentile, self.fpr, self.fdr, self.fwe, data) @@ -399,6 +456,10 @@ class PCA(object): """ A feature transformer that projects vectors to a low-dimensional space using PCA. + .. versionadded:: 1.5.0 + + Examples + -------- >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ... Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])] @@ -408,20 +469,26 @@ class PCA(object): 1.648... >>> pcArray[1] -4.013... - - .. versionadded:: 1.5.0 """ def __init__(self, k): """ - :param k: number of principal components. + Parameters + ---------- + k : int + number of principal components. """ self.k = int(k) - @since('1.5.0') def fit(self, data): """ Computes a [[PCAModel]] that contains the principal components of the input vectors. - :param data: source vectors + + .. versionadded:: 1.5.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + source vectors """ jmodel = callMLlibFunc("fitPCA", self.k, data) return PCAModel(jmodel) @@ -432,16 +499,23 @@ class HashingTF(object): Maps a sequence of terms to their term frequencies using the hashing trick. - .. note:: The terms must be hashable (can not be dict/set/list...). + .. versionadded:: 1.2.0 + + Parameters + ---------- + numFeatures : int, optional + number of features (default: 2^20) - :param numFeatures: number of features (default: 2^20) + Notes + ----- + The terms must be hashable (can not be dict/set/list...). + Examples + -------- >>> htf = HashingTF(100) >>> doc = "a a b b c d".split(" ") >>> htf.transform(doc) SparseVector(100, {...}) - - .. versionadded:: 1.2.0 """ def __init__(self, numFeatures=1 << 20): self.numFeatures = numFeatures @@ -485,7 +559,7 @@ class IDFModel(JavaVectorTransformer): .. versionadded:: 1.2.0 """ - @since('1.2.0') + def transform(self, x): """ Transforms term frequency (TF) vectors to TF-IDF vectors. @@ -494,13 +568,24 @@ def transform(self, x): the terms which occur in fewer than `minDocFreq` documents will have an entry of 0. - .. note:: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. 
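Since `HashingTF` and `IDF` are typically chained, here is a small TF-IDF sketch tying them together (live SparkContext `sc` assumed; the two-document corpus is invented):

```
from pyspark.mllib.feature import HashingTF, IDF

docs = sc.parallelize([
    "spark streaming structured streaming".split(" "),
    "spark mllib feature hashing".split(" "),
])

hashingTF = HashingTF(numFeatures=1 << 10)   # small hash space for the toy corpus
tf = hashingTF.transform(docs)               # RDD of sparse term-frequency vectors
tf.cache()                                   # tf is reused by both fit and transform

idf = IDF(minDocFreq=1).fit(tf)              # terms in fewer than minDocFreq docs get IDF 0
tfidf = idf.transform(tf)
print(tfidf.collect())
```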
versionadded:: 1.2.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + an RDD of term frequency vectors or a term frequency + vector - :param x: an RDD of term frequency vectors or a term frequency - vector - :return: an RDD of TF-IDF vectors or a TF-IDF vector + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + an RDD of TF-IDF vectors or a TF-IDF vector + + Notes + ----- + In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. """ return JavaVectorTransformer.transform(self, x) @@ -539,9 +624,15 @@ class IDF(object): `minDocFreq`). For terms that are not in at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0. - :param minDocFreq: minimum of documents in which a term - should appear for filtering + .. versionadded:: 1.2.0 + + Parameters + ---------- + minDocFreq : int + minimum of documents in which a term should appear for filtering + Examples + -------- >>> n = 4 >>> freqs = [Vectors.sparse(n, (1, 3), (1.0, 2.0)), ... Vectors.dense([0.0, 1.0, 2.0, 3.0]), @@ -560,18 +651,20 @@ class IDF(object): DenseVector([0.0, 0.0, 1.3863, 0.863]) >>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0))) SparseVector(4, {1: 0.0, 3: 0.5754}) - - .. versionadded:: 1.2.0 """ def __init__(self, minDocFreq=0): self.minDocFreq = minDocFreq - @since('1.2.0') def fit(self, dataset): """ Computes the inverse document frequency. - :param dataset: an RDD of term frequency vectors + .. versionadded:: 1.2.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.RDD` + an RDD of term frequency vectors """ if not isinstance(dataset, RDD): raise TypeError("dataset should be an RDD of term frequency vectors") @@ -582,34 +675,55 @@ def fit(self, dataset): class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader): """ class for Word2Vec model - - .. versionadded:: 1.2.0 """ - @since('1.2.0') + def transform(self, word): """ Transforms a word to its vector representation - .. note:: Local use only + .. versionadded:: 1.2.0 + + Parameters + ---------- + word : str + a word - :param word: a word - :return: vector representation of word(s) + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` + vector representation of word(s) + + Notes + ----- + Local use only """ try: return self.call("transform", word) except Py4JJavaError: raise ValueError("%s not found" % word) - @since('1.2.0') def findSynonyms(self, word, num): """ Find synonyms of a word - :param word: a word or a vector representation of word - :param num: number of synonyms to find - :return: array of (word, cosineSimilarity) + .. versionadded:: 1.2.0 + + Parameters + ---------- + + word : str or :py:class:`pyspark.mllib.linalg.Vector` + a word or a vector representation of word + num : int + number of synonyms to find + + Returns + ------- + :py:class:`collections.abc.Iterable` + array of (word, cosineSimilarity) - .. note:: Local use only + Notes + ----- + Local use only """ if not isinstance(word, str): word = _convert_to_vector(word) @@ -653,6 +767,10 @@ class Word2Vec(object): and Distributed Representations of Words and Phrases and their Compositionality. + .. versionadded:: 1.2.0 + + Examples + -------- >>> sentence = "a b " * 100 + "a c " * 10 >>> localDoc = [sentence, sentence] >>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" ")) @@ -686,9 +804,6 @@ class Word2Vec(object): ... rmtree(path) ... except OSError: ... 
pass - - .. versionadded:: 1.2.0 - """ def __init__(self): """ @@ -761,13 +876,20 @@ def setWindowSize(self, windowSize): self.windowSize = windowSize return self - @since('1.2.0') def fit(self, data): """ Computes the vector representation of each word in vocabulary. - :param data: training data. RDD of list of string - :return: Word2VecModel instance + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + training data. RDD of list of string + + Returns + ------- + :py:class:`Word2VecModel` """ if not isinstance(data, RDD): raise TypeError("data should be an RDD of list of string") @@ -783,6 +905,10 @@ class ElementwiseProduct(VectorTransformer): Scales each column of the vector, with the supplied weight vector. i.e the elementwise product. + .. versionadded:: 1.5.0 + + Examples + -------- >>> weight = Vectors.dense([1.0, 2.0, 3.0]) >>> eprod = ElementwiseProduct(weight) >>> a = Vectors.dense([2.0, 1.0, 3.0]) @@ -792,8 +918,6 @@ class ElementwiseProduct(VectorTransformer): >>> rdd = sc.parallelize([a, b]) >>> eprod.transform(rdd).collect() [DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])] - - .. versionadded:: 1.5.0 """ def __init__(self, scalingVector): self.scalingVector = _convert_to_vector(scalingVector) diff --git a/python/pyspark/mllib/feature.pyi b/python/pyspark/mllib/feature.pyi index 9ccec36abd6ff..24a46f6bee798 100644 --- a/python/pyspark/mllib/feature.pyi +++ b/python/pyspark/mllib/feature.pyi @@ -17,7 +17,7 @@ # under the License. from typing import overload -from typing import Iterable, Hashable, List, Tuple +from typing import Iterable, Hashable, List, Tuple, Union from pyspark.mllib._typing import VectorLike from pyspark.context import SparkContext @@ -135,7 +135,7 @@ class IDF: class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader[Word2VecModel]): def transform(self, word: str) -> Vector: ... # type: ignore - def findSynonyms(self, word: str, num: int) -> Iterable[Tuple[str, float]]: ... + def findSynonyms(self, word: Union[str, VectorLike], num: int) -> Iterable[Tuple[str, float]]: ... def getVectors(self) -> JavaMap: ... @classmethod def load(cls, sc: SparkContext, path: str) -> Word2VecModel: ... diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index cbbd7b351b20d..1f87a15cb11c9 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -32,6 +32,10 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): A FP-Growth model for mining frequent itemsets using the Parallel FP-Growth algorithm. + .. versionadded:: 1.4.0 + + Examples + -------- >>> data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]] >>> rdd = sc.parallelize(data, 2) >>> model = FPGrowth.train(rdd, 0.6, 2) @@ -42,8 +46,6 @@ class FPGrowthModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> sameModel = FPGrowthModel.load(sc, model_path) >>> sorted(model.freqItemsets().collect()) == sorted(sameModel.freqItemsets().collect()) True - - .. versionadded:: 1.4.0 """ @since("1.4.0") @@ -72,20 +74,23 @@ class FPGrowth(object): """ @classmethod - @since("1.4.0") def train(cls, data, minSupport=0.3, numPartitions=-1): """ Computes an FP-Growth model that contains frequent itemsets. - :param data: - The input data set, each element contains a transaction. - :param minSupport: - The minimal support level. - (default: 0.3) - :param numPartitions: - The number of partitions used by parallel FP-growth. A value - of -1 will use the same number as input data. - (default: -1) + .. 
versionadded:: 1.4.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The input data set, each element contains a transaction. + minSupport : float, optional + The minimal support level. + (default: 0.3) + numPartitions : int, optional + The number of partitions used by parallel FP-growth. A value + of -1 will use the same number as input data. + (default: -1) """ model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions)) return FPGrowthModel(model) @@ -103,6 +108,10 @@ class PrefixSpanModel(JavaModelWrapper): """ Model fitted by PrefixSpan + .. versionadded:: 1.6.0 + + Examples + -------- >>> data = [ ... [["a", "b"], ["c"]], ... [["a"], ["c", "b"], ["a", "b"]], @@ -112,8 +121,6 @@ class PrefixSpanModel(JavaModelWrapper): >>> model = PrefixSpan.train(rdd) >>> sorted(model.freqSequences().collect()) [FreqSequence(sequence=[['a']], freq=3), FreqSequence(sequence=[['a'], ['a']], freq=1), ... - - .. versionadded:: 1.6.0 """ @since("1.6.0") @@ -125,38 +132,45 @@ def freqSequences(self): class PrefixSpan(object): """ A parallel PrefixSpan algorithm to mine frequent sequential patterns. - The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: - Mining Sequential Patterns Efficiently by Prefix-Projected Pattern Growth - ([[https://doi.org/10.1109/ICDE.2001.914830]]). + The PrefixSpan algorithm is described in Jian Pei et al (2001) [1]_ .. versionadded:: 1.6.0 + + .. [1] Jian Pei et al., + "PrefixSpan,: mining sequential patterns efficiently by prefix-projected pattern growth," + Proceedings 17th International Conference on Data Engineering, Heidelberg, + Germany, 2001, pp. 215-224, + doi: https://doi.org/10.1109/ICDE.2001.914830 """ @classmethod - @since("1.6.0") def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000): """ Finds the complete set of frequent sequential patterns in the input sequences of itemsets. - :param data: - The input data set, each element contains a sequence of - itemsets. - :param minSupport: - The minimal support level of the sequential pattern, any - pattern that appears more than (minSupport * - size-of-the-dataset) times will be output. - (default: 0.1) - :param maxPatternLength: - The maximal length of the sequential pattern, any pattern - that appears less than maxPatternLength will be output. - (default: 10) - :param maxLocalProjDBSize: - The maximum number of items (including delimiters used in the - internal storage format) allowed in a projected database before - local processing. If a projected database exceeds this size, - another iteration of distributed prefix growth is run. - (default: 32000000) + .. versionadded:: 1.6.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The input data set, each element contains a sequence of + itemsets. + minSupport : float, optional + The minimal support level of the sequential pattern, any + pattern that appears more than (minSupport * + size-of-the-dataset) times will be output. + (default: 0.1) + maxPatternLength : int, optional + The maximal length of the sequential pattern, any pattern + that appears less than maxPatternLength will be output. + (default: 10) + maxLocalProjDBSize : int, optional + The maximum number of items (including delimiters used in the + internal storage format) allowed in a projected database before + local processing. If a projected database exceeds this size, + another iteration of distributed prefix growth is run. 
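For orientation on the two mining APIs whose docstrings change above, a minimal end-to-end sketch, assuming an active SparkContext `sc` as in the surrounding doctests; the support thresholds and toy inputs are chosen only for illustration:

```
from pyspark.mllib.fpm import FPGrowth, PrefixSpan

# Frequent itemsets with parallel FP-Growth.
transactions = sc.parallelize(
    [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]], 2)
fp_model = FPGrowth.train(transactions, minSupport=0.6, numPartitions=2)
for itemset in sorted(fp_model.freqItemsets().collect()):
    print(itemset)          # FreqItemset(items=..., freq=...)

# Frequent sequential patterns with PrefixSpan.
sequences = sc.parallelize([
    [["a", "b"], ["c"]],
    [["a"], ["c", "b"], ["a", "b"]],
    [["a", "b"], ["e"]],
    [["f"]]], 2)
ps_model = PrefixSpan.train(sequences, minSupport=0.5, maxPatternLength=5)
print(sorted(ps_model.freqSequences().collect())[:3])
```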
+ (default: 32000000) """ model = callMLlibFunc("trainPrefixSpanModel", data, minSupport, maxPatternLength, maxLocalProjDBSize) diff --git a/python/pyspark/mllib/fpm.pyi b/python/pyspark/mllib/fpm.pyi index 880baae1a91a5..c5a6b5f6806c0 100644 --- a/python/pyspark/mllib/fpm.pyi +++ b/python/pyspark/mllib/fpm.pyi @@ -37,8 +37,8 @@ class FPGrowth: cls, data: RDD[List[T]], minSupport: float = ..., numPartitions: int = ... ) -> FPGrowthModel[T]: ... class FreqItemset(Generic[T]): - items = ... # List[T] - freq = ... # int + items: List[T] + freq: int class PrefixSpanModel(JavaModelWrapper, Generic[T]): def freqSequences(self) -> RDD[PrefixSpan.FreqSequence[T]]: ... diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index c1402fb98a50d..f20004ab70ab3 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -71,6 +71,8 @@ def _vector_size(v): """ Returns the size of the vector. + Examples + -------- >>> _vector_size([1., 2., 3.]) 3 >>> _vector_size((1., 2., 3.)) @@ -231,7 +233,9 @@ def toArray(self): """ Convert the vector into an numpy.ndarray - :return: numpy.ndarray + Returns + ------- + :py:class:`numpy.ndarray` """ raise NotImplementedError @@ -240,7 +244,9 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.Vector` + Returns + ------- + :py:class:`pyspark.ml.linalg.Vector` """ raise NotImplementedError @@ -251,6 +257,8 @@ class DenseVector(Vector): storage and arithmetics will be delegated to the underlying numpy array. + Examples + -------- >>> v = Vectors.dense([1.0, 2.0]) >>> u = Vectors.dense([3.0, 4.0]) >>> v + u @@ -282,6 +290,8 @@ def parse(s): """ Parse string representation back into the DenseVector. + Examples + -------- >>> DenseVector.parse(' [ 0.0,1.0,2.0, 3.0]') DenseVector([0.0, 1.0, 2.0, 3.0]) """ @@ -312,6 +322,8 @@ def norm(self, p): """ Calculates the norm of a DenseVector. + Examples + -------- >>> a = DenseVector([0, -1, 2, -3]) >>> a.norm(2) 3.7... @@ -327,6 +339,8 @@ def dot(self, other): and a target NumPy array that is either 1- or 2-dimensional. Equivalent to calling numpy.dot of the two vectors. + Examples + -------- >>> dense = DenseVector(array.array('d', [1., 2.])) >>> dense.dot(dense) 5.0 @@ -367,6 +381,8 @@ def squared_distance(self, other): """ Squared distance of two Vectors. + Examples + -------- >>> dense1 = DenseVector(array.array('d', [1., 2.])) >>> dense1.squared_distance(dense1) 0.0 @@ -412,9 +428,11 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.DenseVector` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.DenseVector` """ return newlinalg.DenseVector(self.array) @@ -501,12 +519,18 @@ def __init__(self, size, *args): (index, value) pairs, or two separate arrays of indices and values (sorted by index). - :param size: Size of the vector. - :param args: Active entries, as a dictionary {index: value, ...}, - a list of tuples [(index, value), ...], or a list of strictly - increasing indices and a list of corresponding values [index, ...], - [value, ...]. Inactive entries are treated as zeros. - + Parameters + ---------- + size : int + Size of the vector. 
+ args + Active entries, as a dictionary {index: value, ...}, + a list of tuples [(index, value), ...], or a list of strictly + increasing indices and a list of corresponding values [index, ...], + [value, ...]. Inactive entries are treated as zeros. + + Examples + -------- >>> SparseVector(4, {1: 1.0, 3: 5.5}) SparseVector(4, {1: 1.0, 3: 5.5}) >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) @@ -556,6 +580,8 @@ def norm(self, p): """ Calculates the norm of a SparseVector. + Examples + -------- >>> a = SparseVector(4, [0, 1], [3., -4.]) >>> a.norm(1) 7.0 @@ -574,6 +600,8 @@ def parse(s): """ Parse string representation back into the SparseVector. + Examples + -------- >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )') SparseVector(4, {0: 4.0, 1: 5.0}) """ @@ -622,6 +650,8 @@ def dot(self, other): """ Dot product with a SparseVector or 1- or 2-dimensional Numpy array. + Examples + -------- >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) >>> a.dot(a) 25.0 @@ -678,6 +708,8 @@ def squared_distance(self, other): """ Squared distance from a SparseVector or 1-dimensional NumPy array. + Examples + -------- >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) >>> a.squared_distance(a) 0.0 @@ -754,9 +786,11 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.SparseVector` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.SparseVector` """ return newlinalg.SparseVector(self.size, self.indices, self.values) @@ -828,10 +862,12 @@ class Vectors(object): """ Factory methods for working with vectors. - .. note:: Dense vectors are simply represented as NumPy array objects, - so there is no need to covert them for use in MLlib. For sparse vectors, - the factory methods in this class create an MLlib-compatible type, or users - can pass in SciPy's `scipy.sparse` column vectors. + Notes + ----- + Dense vectors are simply represented as NumPy array objects, + so there is no need to covert them for use in MLlib. For sparse vectors, + the factory methods in this class create an MLlib-compatible type, or users + can pass in SciPy's `scipy.sparse` column vectors. """ @staticmethod @@ -841,10 +877,16 @@ def sparse(size, *args): (index, value) pairs, or two separate arrays of indices and values (sorted by index). - :param size: Size of the vector. - :param args: Non-zero entries, as a dictionary, list of tuples, - or two sorted lists containing indices and values. + Parameters + ---------- + size : int + Size of the vector. + args + Non-zero entries, as a dictionary, list of tuples, + or two sorted lists containing indices and values. + Examples + -------- >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) SparseVector(4, {1: 1.0, 3: 5.5}) >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) @@ -859,6 +901,8 @@ def dense(*elements): """ Create a dense vector of 64-bit floats from a Python list or numbers. + Examples + -------- >>> Vectors.dense([1, 2, 3]) DenseVector([1.0, 2.0, 3.0]) >>> Vectors.dense(1.0, 2.0) @@ -875,10 +919,15 @@ def fromML(vec): Convert a vector from the new mllib-local representation. This does NOT copy the data; it copies references. - :param vec: a :py:class:`pyspark.ml.linalg.Vector` - :return: a :py:class:`pyspark.mllib.linalg.Vector` - .. 
versionadded:: 2.0.0 + + Parameters + ---------- + vec : :py:class:`pyspark.ml.linalg.Vector` + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Vector` """ if isinstance(vec, newlinalg.DenseVector): return DenseVector(vec.array) @@ -893,6 +942,8 @@ def stringify(vector): Converts a vector into a string, which can be recognized by Vectors.parse(). + Examples + -------- >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0])) '(2,[1],[1.0])' >>> Vectors.stringify(Vectors.dense([0.0, 1.0])) @@ -907,6 +958,8 @@ def squared_distance(v1, v2): a and b can be of type SparseVector, DenseVector, np.ndarray or array.array. + Examples + -------- >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) >>> b = Vectors.dense([2, 5, 4, 1]) >>> a.squared_distance(b) @@ -926,6 +979,8 @@ def norm(vector, p): def parse(s): """Parse a string representation back into the Vector. + Examples + -------- >>> Vectors.parse('[2,1,2 ]') DenseVector([2.0, 1.0, 2.0]) >>> Vectors.parse(' ( 100, [0], [2])') @@ -1023,6 +1078,8 @@ def __str__(self): """ Pretty printing of a DenseMatrix + Examples + -------- >>> dm = DenseMatrix(2, 2, range(4)) >>> print(dm) DenseMatrix([[ 0., 2.], @@ -1044,6 +1101,8 @@ def __repr__(self): """ Representation of a DenseMatrix + Examples + -------- >>> dm = DenseMatrix(2, 2, range(4)) >>> dm DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) @@ -1067,6 +1126,8 @@ def toArray(self): """ Return an numpy.ndarray + Examples + -------- >>> m = DenseMatrix(2, 2, range(4)) >>> m.toArray() array([[ 0., 2.], @@ -1098,9 +1159,11 @@ def asML(self): Convert this matrix to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.DenseMatrix` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.DenseMatrix` """ return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed) @@ -1154,6 +1217,8 @@ def __str__(self): """ Pretty printing of a SparseMatrix + Examples + -------- >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) >>> print(sm1) 2 X 2 CSCMatrix @@ -1200,6 +1265,8 @@ def __repr__(self): """ Representation of a SparseMatrix + Examples + -------- >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) >>> sm1 SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) @@ -1281,9 +1348,11 @@ def asML(self): Convert this matrix to the new mllib-local representation. This does NOT copy the data; it copies references. - :return: :py:class:`pyspark.ml.linalg.SparseMatrix` - .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`pyspark.ml.linalg.SparseMatrix` """ return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices, self.values, self.isTransposed) @@ -1314,10 +1383,15 @@ def fromML(mat): Convert a matrix from the new mllib-local representation. This does NOT copy the data; it copies references. - :param mat: a :py:class:`pyspark.ml.linalg.Matrix` - :return: a :py:class:`pyspark.mllib.linalg.Matrix` - .. 
versionadded:: 2.0.0 + + Parameters + ---------- + mat : :py:class:`pyspark.ml.linalg.Matrix` + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Matrix` """ if isinstance(mat, newlinalg.DenseMatrix): return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 603d31d3d7b26..f0e889b15bf51 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -55,16 +55,22 @@ class RowMatrix(DistributedMatrix): Represents a row-oriented distributed Matrix with no meaningful row indices. - :param rows: An RDD or DataFrame of vectors. If a DataFrame is provided, it must have a single - vector typed column. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the number of - records in the `rows` RDD. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the size of - the first row. + + Parameters + ---------- + rows : :py:class:`pyspark.RDD` or :py:class:`pyspark.sql.DataFrame` + An RDD or DataFrame of vectors. If a DataFrame is provided, it must have a single + vector typed column. + numRows : int, optional + Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the number of + records in the `rows` RDD. + numCols : int, optional + Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the size of + the first row. """ def __init__(self, rows, numRows=0, numCols=0): """ @@ -77,6 +83,8 @@ def __init__(self, rows, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -108,6 +116,8 @@ def rows(self): """ Rows of the RowMatrix stored as an RDD of vectors. + Examples + -------- >>> mat = RowMatrix(sc.parallelize([[1, 2, 3], [4, 5, 6]])) >>> rows = mat.rows >>> rows.first() @@ -119,6 +129,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], ... [7, 8, 9], [10, 11, 12]]) @@ -136,6 +148,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6], ... [7, 8, 9], [10, 11, 12]]) @@ -149,14 +163,19 @@ def numCols(self): """ return self._java_matrix_wrapper.call("numCols") - @since('2.0.0') def computeColumnSummaryStatistics(self): """ Computes column-wise summary statistics. - :return: :class:`MultivariateStatisticalSummary` object - containing column-wise summary statistics. + .. versionadded:: 2.0.0 + + Returns + ------- + :py:class:`MultivariateStatisticalSummary` + object containing column-wise summary statistics. + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -167,14 +186,19 @@ def computeColumnSummaryStatistics(self): java_col_stats = self._java_matrix_wrapper.call("computeColumnSummaryStatistics") return MultivariateStatisticalSummary(java_col_stats) - @since('2.0.0') def computeCovariance(self): """ Computes the covariance matrix, treating each row as an observation. - .. note:: This cannot be computed on matrices with more than 65535 columns. 
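To make the RowMatrix statistics methods above concrete, a small sketch assuming `sc` is available; the 3x3 input matrix is an arbitrary example:

```
from pyspark.mllib.linalg.distributed import RowMatrix

rows = sc.parallelize([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
mat = RowMatrix(rows)

stats = mat.computeColumnSummaryStatistics()
print(stats.mean())       # column-wise means
print(stats.variance())   # column-wise variances

print(mat.computeCovariance())      # local DenseMatrix; rows are observations
print(mat.computeGramianMatrix())   # local DenseMatrix holding A^T A
```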
+ .. versionadded:: 2.0.0 + + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. + Examples + -------- >>> rows = sc.parallelize([[1, 2], [2, 1]]) >>> mat = RowMatrix(rows) @@ -183,13 +207,18 @@ def computeCovariance(self): """ return self._java_matrix_wrapper.call("computeCovariance") - @since('2.0.0') def computeGramianMatrix(self): """ Computes the Gramian matrix `A^T A`. - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.0.0 + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. + + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -220,11 +249,12 @@ def columnSimilarities(self, threshold=0.0): similarity threshold. To describe the guarantee, we set some notation: - * Let A be the smallest in magnitude non-zero element of - this matrix. - * Let B be the largest in magnitude non-zero element of - this matrix. - * Let L be the maximum number of non-zeros per row. + + - Let A be the smallest in magnitude non-zero element of + this matrix. + - Let B be the largest in magnitude non-zero element of + this matrix. + - Let L be the maximum number of non-zeros per row. For example, for {0,1} matrices: A=B=1. Another example, for the Netflix matrix: A=1, B=5 @@ -236,20 +266,31 @@ def columnSimilarities(self, threshold=0.0): The shuffle size is bounded by the *smaller* of the following two expressions: - * O(n log(n) L / (threshold * A)) - * O(m L^2^) + - O(n log(n) L / (threshold * A)) + - O(m L^2^) The latter is the cost of the brute-force approach, so for non-zero thresholds, the cost is always cheaper than the brute-force approach. - :param: threshold: Set to 0 for deterministic guaranteed - correctness. Similarities above this - threshold are estimated with the cost vs - estimate quality trade-off described above. - :return: An n x n sparse upper-triangular CoordinateMatrix of - cosine similarities between columns of this matrix. + .. versionadded:: 2.0.0 + + Parameters + ---------- + threshold : float, optional + Set to 0 for deterministic guaranteed + correctness. Similarities above this + threshold are estimated with the cost vs + estimate quality trade-off described above. + Returns + ------- + :py:class:`CoordinateMatrix` + An n x n sparse upper-triangular CoordinateMatrix of + cosine similarities between columns of this matrix. + + Examples + -------- >>> rows = sc.parallelize([[1, 2], [1, 5]]) >>> mat = RowMatrix(rows) @@ -260,23 +301,32 @@ def columnSimilarities(self, threshold=0.0): java_sims_mat = self._java_matrix_wrapper.call("columnSimilarities", float(threshold)) return CoordinateMatrix(java_sims_mat) - @since('2.0.0') def tallSkinnyQR(self, computeQ=False): """ Compute the QR decomposition of this RowMatrix. The implementation is designed to optimize the QR decomposition - (factorization) for the RowMatrix of a tall and skinny shape. + (factorization) for the RowMatrix of a tall and skinny shape [1]_. - Reference: - Paul G. Constantine, David F. Gleich. "Tall and skinny QR - factorizations in MapReduce architectures" - ([[https://doi.org/10.1145/1996092.1996103]]) + .. [1] Paul G. Constantine, David F. Gleich. "Tall and skinny QR + factorizations in MapReduce architectures" + https://doi.org/10.1145/1996092.1996103 - :param: computeQ: whether to computeQ - :return: QRDecomposition(Q: RowMatrix, R: Matrix), where - Q = None if computeQ = false. + .. 
versionadded:: 2.0.0 + Parameters + ---------- + computeQ : bool, optional + whether to computeQ + + Returns + ------- + :py:class:`pyspark.mllib.linalg.QRDecomposition` + QRDecomposition(Q: RowMatrix, R: Matrix), where + Q = None if computeQ = false. + + Examples + -------- >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]]) >>> mat = RowMatrix(rows) >>> decomp = mat.tallSkinnyQR(True) @@ -301,7 +351,6 @@ def tallSkinnyQR(self, computeQ=False): R = decomp.call("R") return QRDecomposition(Q, R) - @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the RowMatrix. @@ -309,27 +358,39 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): The given row matrix A of dimension (m X n) is decomposed into U * s * V'T where - * U: (m X k) (left singular vectors) is a RowMatrix whose - columns are the eigenvectors of (A X A') - * s: DenseVector consisting of square root of the eigenvalues - (singular values) in descending order. - * v: (n X k) (right singular vectors) is a Matrix whose columns - are the eigenvectors of (A' X A) + - U: (m X k) (left singular vectors) is a RowMatrix whose + columns are the eigenvectors of (A X A') + - s: DenseVector consisting of square root of the eigenvalues + (singular values) in descending order. + - v: (n X k) (right singular vectors) is a Matrix whose columns + are the eigenvectors of (A' X A) For more specific details on implementation, please refer the Scala documentation. - :param k: Number of leading singular values to keep (`0 < k <= n`). - It might return less than k if there are numerically zero singular values - or there are not enough Ritz values converged before the maximum number of - Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). - :param computeU: Whether or not to compute U. If set to be - True, then U is computed by A * V * s^-1 - :param rCond: Reciprocal condition number. All singular values - smaller than rCond * s[0] are treated as zero - where s[0] is the largest singular value. - :returns: :py:class:`SingularValueDecomposition` - + .. versionadded:: 2.2.0 + + Parameters + ---------- + k : int + Number of leading singular values to keep (`0 < k <= n`). + It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). + computeU : bool, optional + Whether or not to compute U. If set to be + True, then U is computed by A * V * s^-1 + rCond : float, optional + Reciprocal condition number. All singular values + smaller than rCond * s[0] are treated as zero + where s[0] is the largest singular value. + + Returns + ------- + :py:class:`SingularValueDecomposition` + + Examples + -------- >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]]) >>> rm = RowMatrix(rows) @@ -345,16 +406,27 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) - @since('2.2.0') def computePrincipalComponents(self, k): """ Computes the k principal components of the given row matrix - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.2.0 + + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. - :param k: Number of principal components to keep. 
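A compact sketch of the three factorizations documented in this hunk (QR, SVD, PCA) on a tiny RowMatrix, again assuming `sc`; the k values are illustrative:

```
from pyspark.mllib.linalg.distributed import RowMatrix

rm = RowMatrix(sc.parallelize([[3.0, 1.0, 1.0], [-1.0, 3.0, 1.0]]))

decomp = rm.tallSkinnyQR(computeQ=True)
print(decomp.R)                  # upper-triangular factor (local Matrix)

svd = rm.computeSVD(2, computeU=True)
print(svd.s)                     # singular values, descending
print(svd.V)                     # right singular vectors (local Matrix)
print(svd.U.rows.collect())      # left singular vectors as a RowMatrix

print(rm.computePrincipalComponents(1))   # local DenseMatrix of size n x k
```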
- :returns: :py:class:`pyspark.mllib.linalg.DenseMatrix` + Parameters + ---------- + k : int + Number of principal components to keep. + Returns + ------- + :py:class:`pyspark.mllib.linalg.DenseMatrix` + + Examples + -------- >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) >>> rm = RowMatrix(rows) @@ -370,15 +442,24 @@ def computePrincipalComponents(self, k): """ return self._java_matrix_wrapper.call("computePrincipalComponents", k) - @since('2.2.0') def multiply(self, matrix): """ Multiply this matrix by a local dense matrix on the right. - :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix - :returns: :py:class:`RowMatrix` + .. versionadded:: 2.2.0 + + Parameters + ---------- + matrix : :py:class:`pyspark.mllib.linalg.Matrix` + a local dense matrix whose number of rows must match the number of columns + of this matrix + Returns + ------- + :py:class:`RowMatrix` + + Examples + -------- >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] @@ -438,8 +519,12 @@ class IndexedRow(object): Just a wrapper over a (int, vector) tuple. - :param index: The index for the given row. - :param vector: The row in the matrix at the given index. + Parameters + ---------- + index : int + The index for the given row. + vector : :py:class:`pyspark.mllib.linalg.Vector` or convertible + The row in the matrix at the given index. """ def __init__(self, index, vector): self.index = int(index) @@ -462,16 +547,21 @@ class IndexedRowMatrix(DistributedMatrix): """ Represents a row-oriented distributed Matrix with indexed rows. - :param rows: An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a - int typed column of indices and a vector typed column. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the max row - index plus one. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the size of - the first row. + Parameters + ---------- + rows : :py:class:`pyspark.RDD` + An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of a + int typed column of indices and a vector typed column. + numRows : int, optional + Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the max row + index plus one. + numCols : int, optional + Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the size of + the first row. """ def __init__(self, rows, numRows=0, numCols=0): """ @@ -484,6 +574,8 @@ def __init__(self, rows, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows) @@ -524,6 +616,8 @@ def rows(self): """ Rows of the IndexedRowMatrix stored as an RDD of IndexedRows. + Examples + -------- >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])])) >>> rows = mat.rows @@ -542,6 +636,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... 
IndexedRow(1, [4, 5, 6]), ... IndexedRow(2, [7, 8, 9]), @@ -561,6 +657,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6]), ... IndexedRow(2, [7, 8, 9]), @@ -580,6 +678,8 @@ def columnSimilarities(self): """ Compute all cosine similarities between columns. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(6, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows) @@ -590,13 +690,18 @@ def columnSimilarities(self): java_coordinate_matrix = self._java_matrix_wrapper.call("columnSimilarities") return CoordinateMatrix(java_coordinate_matrix) - @since('2.0.0') def computeGramianMatrix(self): """ Computes the Gramian matrix `A^T A`. - .. note:: This cannot be computed on matrices with more than 65535 columns. + .. versionadded:: 2.0.0 + + Notes + ----- + This cannot be computed on matrices with more than 65535 columns. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(1, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows) @@ -610,6 +715,8 @@ def toRowMatrix(self): """ Convert this matrix to a RowMatrix. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(6, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows).toRowMatrix() @@ -623,6 +730,8 @@ def toCoordinateMatrix(self): """ Convert this matrix to a CoordinateMatrix. + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 0]), ... IndexedRow(6, [0, 5])]) >>> mat = IndexedRowMatrix(rows).toCoordinateMatrix() @@ -636,13 +745,19 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): """ Convert this matrix to a BlockMatrix. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - + Parameters + ---------- + rowsPerBlock : int, optional + Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + colsPerBlock : int, optional + Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + + Examples + -------- >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... IndexedRow(6, [4, 5, 6])]) >>> mat = IndexedRowMatrix(rows).toBlockMatrix() @@ -661,7 +776,6 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): colsPerBlock) return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) - @since('2.2.0') def computeSVD(self, k, computeU=False, rCond=1e-9): """ Computes the singular value decomposition of the IndexedRowMatrix. @@ -679,17 +793,29 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): For more specific details on implementation, please refer the scala documentation. - :param k: Number of leading singular values to keep (`0 < k <= n`). - It might return less than k if there are numerically zero singular values - or there are not enough Ritz values converged before the maximum number of - Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). - :param computeU: Whether or not to compute U. If set to be - True, then U is computed by A * V * s^-1 - :param rCond: Reciprocal condition number. 
All singular values - smaller than rCond * s[0] are treated as zero - where s[0] is the largest singular value. - :returns: SingularValueDecomposition object - + .. versionadded:: 2.2.0 + + Parameters + ---------- + k : int + Number of leading singular values to keep (`0 < k <= n`). + It might return less than k if there are numerically zero singular values + or there are not enough Ritz values converged before the maximum number of + Arnoldi update iterations is reached (in case that matrix A is ill-conditioned). + computeU : bool, optional + Whether or not to compute U. If set to be + True, then U is computed by A * V * s^-1 + rCond : float, optional + Reciprocal condition number. All singular values + smaller than rCond * s[0] are treated as zero + where s[0] is the largest singular value. + + Returns + ------- + :py:class:`SingularValueDecomposition` + + Examples + -------- >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))] >>> irm = IndexedRowMatrix(sc.parallelize(rows)) >>> svd_model = irm.computeSVD(2, True) @@ -705,15 +831,24 @@ def computeSVD(self, k, computeU=False, rCond=1e-9): "computeSVD", int(k), bool(computeU), float(rCond)) return SingularValueDecomposition(j_model) - @since('2.2.0') def multiply(self, matrix): """ Multiply this matrix by a local dense matrix on the right. - :param matrix: a local dense matrix whose number of rows must match the number of columns - of this matrix - :returns: :py:class:`IndexedRowMatrix` + .. versionadded:: 2.2.0 + + Parameters + ---------- + matrix : :py:class:`pyspark.mllib.linalg.Matrix` + a local dense matrix whose number of rows must match the number of columns + of this matrix + Returns + ------- + :py:class:`IndexedRowMatrix` + + Examples + -------- >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] @@ -730,9 +865,14 @@ class MatrixEntry(object): Just a wrapper over a (int, int, float) tuple. - :param i: The row index of the matrix. - :param j: The column index of the matrix. - :param value: The (i, j)th entry of the matrix, as a float. + Parameters + ---------- + i : int + The row index of the matrix. + j : int + The column index of the matrix. + value : float + The (i, j)th entry of the matrix, as a float. """ def __init__(self, i, j, value): self.i = int(i) @@ -756,16 +896,21 @@ class CoordinateMatrix(DistributedMatrix): """ Represents a matrix in coordinate format. - :param entries: An RDD of MatrixEntry inputs or - (int, int, float) tuples. - :param numRows: Number of rows in the matrix. A non-positive - value means unknown, at which point the number - of rows will be determined by the max row - index plus one. - :param numCols: Number of columns in the matrix. A non-positive - value means unknown, at which point the number - of columns will be determined by the max row - index plus one. + Parameters + ---------- + entries : :py:class:`pyspark.RDD` + An RDD of MatrixEntry inputs or + (int, int, float) tuples. + numRows : int, optional + Number of rows in the matrix. A non-positive + value means unknown, at which point the number + of rows will be determined by the max row + index plus one. + numCols : int, optional + Number of columns in the matrix. A non-positive + value means unknown, at which point the number + of columns will be determined by the max row + index plus one. 
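The same decomposition is available on IndexedRowMatrix, and CoordinateMatrix infers its dimensions from the largest indices; a brief sketch assuming `sc`:

```
from pyspark.mllib.linalg.distributed import (
    IndexedRow, IndexedRowMatrix, MatrixEntry, CoordinateMatrix)

irm = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [3.0, 1.0, 1.0]),
                                       IndexedRow(1, [-1.0, 3.0, 1.0])]))
svd = irm.computeSVD(2, computeU=True)
print(svd.U.rows.collect())      # U is an IndexedRowMatrix, indices preserved
print(svd.s)

entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(6, 4, 2.1)])
cm = CoordinateMatrix(entries)
print(cm.numRows(), cm.numCols())   # 7 5: max index + 1 in each dimension
```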
""" def __init__(self, entries, numRows=0, numCols=0): """ @@ -778,6 +923,8 @@ def __init__(self, entries, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries) @@ -817,6 +964,8 @@ def entries(self): Entries of the CoordinateMatrix stored as an RDD of MatrixEntries. + Examples + -------- >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)])) >>> entries = mat.entries @@ -835,6 +984,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(1, 0, 2), ... MatrixEntry(2, 1, 3.7)]) @@ -853,6 +1004,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(1, 0, 2), ... MatrixEntry(2, 1, 3.7)]) @@ -867,11 +1020,14 @@ def numCols(self): """ return self._java_matrix_wrapper.call("numCols") - @since('2.0.0') def transpose(self): """ Transpose this CoordinateMatrix. + .. versionadded:: 2.0.0 + + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(1, 0, 2), ... MatrixEntry(2, 1, 3.7)]) @@ -891,6 +1047,8 @@ def toRowMatrix(self): """ Convert this matrix to a RowMatrix. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries).toRowMatrix() @@ -915,6 +1073,8 @@ def toIndexedRowMatrix(self): """ Convert this matrix to an IndexedRowMatrix. + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries).toIndexedRowMatrix() @@ -938,13 +1098,19 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): """ Convert this matrix to a BlockMatrix. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. - + Parameters + ---------- + rowsPerBlock : int, optional + Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + colsPerBlock : int, optional + Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + + Examples + -------- >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2), ... MatrixEntry(6, 4, 2.1)]) >>> mat = CoordinateMatrix(entries).toBlockMatrix() @@ -983,26 +1149,33 @@ class BlockMatrix(DistributedMatrix): """ Represents a distributed matrix in blocks of local matrices. - :param blocks: An RDD of sub-matrix blocks - ((blockRowIndex, blockColIndex), sub-matrix) that - form this distributed matrix. If multiple blocks - with the same index exist, the results for - operations like add and multiply will be - unpredictable. - :param rowsPerBlock: Number of rows that make up each block. - The blocks forming the final rows are not - required to have the given number of rows. - :param colsPerBlock: Number of columns that make up each block. - The blocks forming the final columns are not - required to have the given number of columns. 
- :param numRows: Number of rows of this matrix. If the supplied - value is less than or equal to zero, the number - of rows will be calculated when `numRows` is - invoked. - :param numCols: Number of columns of this matrix. If the supplied - value is less than or equal to zero, the number - of columns will be calculated when `numCols` is - invoked. + Parameters + ---------- + blocks : :py:class:`pyspark.RDD` + An RDD of sub-matrix blocks + ((blockRowIndex, blockColIndex), sub-matrix) that + form this distributed matrix. If multiple blocks + with the same index exist, the results for + operations like add and multiply will be + unpredictable. + rowsPerBlock : int + Number of rows that make up each block. + The blocks forming the final rows are not + required to have the given number of rows. + colsPerBlock : int + Number of columns that make up each block. + The blocks forming the final columns are not + required to have the given number of columns. + numRows : int, optional + Number of rows of this matrix. If the supplied + value is less than or equal to zero, the number + of rows will be calculated when `numRows` is + invoked. + numCols : int, optional + Number of columns of this matrix. If the supplied + value is less than or equal to zero, the number + of columns will be calculated when `numCols` is + invoked. """ def __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows=0, numCols=0): """ @@ -1015,6 +1188,8 @@ def __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows=0, numCols=0): object, in which case we can wrap it directly. This assists in clean matrix conversions. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1058,6 +1233,8 @@ def blocks(self): ((blockRowIndex, blockColIndex), sub-matrix) that form this distributed matrix. + Examples + -------- >>> mat = BlockMatrix( ... sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2) @@ -1079,6 +1256,8 @@ def rowsPerBlock(self): """ Number of rows that make up each block. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1092,6 +1271,8 @@ def colsPerBlock(self): """ Number of columns that make up each block. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1105,6 +1286,8 @@ def numRowBlocks(self): """ Number of rows of blocks in the BlockMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1118,6 +1301,8 @@ def numColBlocks(self): """ Number of columns of blocks in the BlockMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1130,6 +1315,8 @@ def numRows(self): """ Get or compute the number of rows. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... 
((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) @@ -1147,6 +1334,8 @@ def numCols(self): """ Get or compute the number of cols. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) @@ -1197,6 +1386,8 @@ def add(self, other): two dense sub matrix blocks are added, the output block will also be a DenseMatrix. + Examples + -------- >>> dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12]) @@ -1220,7 +1411,6 @@ def add(self, other): java_block_matrix = self._java_matrix_wrapper.call("add", other_java_block_matrix) return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - @since('2.0.0') def subtract(self, other): """ Subtracts the given block matrix `other` from this block matrix: @@ -1232,6 +1422,10 @@ def subtract(self, other): If two dense sub matrix blocks are subtracted, the output block will also be a DenseMatrix. + .. versionadded:: 2.0.0 + + Examples + -------- >>> dm1 = Matrices.dense(3, 2, [3, 1, 5, 4, 6, 2]) >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]) >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [1, 2, 3]) @@ -1265,6 +1459,8 @@ def multiply(self, other): This may cause some performance issues until support for multiplying two sparse matrices is added. + Examples + -------- >>> dm1 = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6]) >>> dm2 = Matrices.dense(2, 3, [7, 8, 9, 10, 11, 12]) >>> dm3 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6]) @@ -1290,12 +1486,15 @@ def multiply(self, other): java_block_matrix = self._java_matrix_wrapper.call("multiply", other_java_block_matrix) return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock) - @since('2.0.0') def transpose(self): """ Transpose this BlockMatrix. Returns a new BlockMatrix instance sharing the same underlying data. Is a lazy operation. + .. versionadded:: 2.0.0 + + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2) @@ -1311,6 +1510,8 @@ def toLocalMatrix(self): """ Collect the distributed matrix on the driver as a DenseMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2).toLocalMatrix() @@ -1333,6 +1534,8 @@ def toIndexedRowMatrix(self): """ Convert this matrix to an IndexedRowMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])), ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]) >>> mat = BlockMatrix(blocks, 3, 2).toIndexedRowMatrix() @@ -1356,6 +1559,8 @@ def toCoordinateMatrix(self): """ Convert this matrix to a CoordinateMatrix. + Examples + -------- >>> blocks = sc.parallelize([((0, 0), Matrices.dense(1, 2, [1, 2])), ... 
((1, 0), Matrices.dense(1, 2, [7, 8]))]) >>> mat = BlockMatrix(blocks, 1, 2).toCoordinateMatrix() diff --git a/python/pyspark/mllib/linalg/distributed.pyi b/python/pyspark/mllib/linalg/distributed.pyi index 238c4ea32e4e8..7ec2d60c5a947 100644 --- a/python/pyspark/mllib/linalg/distributed.pyi +++ b/python/pyspark/mllib/linalg/distributed.pyi @@ -22,6 +22,7 @@ from pyspark.storagelevel import StorageLevel from pyspark.mllib.common import JavaModelWrapper from pyspark.mllib.linalg import Vector, Matrix, QRDecomposition from pyspark.mllib.stat import MultivariateStatisticalSummary +import pyspark.sql.dataframe from numpy import ndarray # noqa: F401 VectorLike = Union[Vector, Sequence[Union[float, int]]] @@ -35,7 +36,10 @@ class DistributedMatrix: class RowMatrix(DistributedMatrix): def __init__( - self, rows: RDD[Vector], numRows: int = ..., numCols: int = ... + self, + rows: Union[RDD[Vector], pyspark.sql.dataframe.DataFrame], + numRows: int = ..., + numCols: int = ..., ) -> None: ... @property def rows(self) -> RDD[Vector]: ... diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py index 6106c58584882..a33dfe26fbad9 100644 --- a/python/pyspark/mllib/random.py +++ b/python/pyspark/mllib/random.py @@ -22,7 +22,6 @@ import sys from functools import wraps -from pyspark import since from pyspark.mllib.common import callMLlibFunc @@ -46,7 +45,6 @@ class RandomRDDs(object): """ @staticmethod - @since("1.1.0") def uniformRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the @@ -56,12 +54,26 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): to U(a, b), use ``RandomRDDs.uniformRDD(sc, n, p, seed).map(lambda v: a + (b - a) * v)`` - :param sc: SparkContext used to create the RDD. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`. - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + used to create the RDD. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`. + + Examples + -------- >>> x = RandomRDDs.uniformRDD(sc, 100).collect() >>> len(x) 100 @@ -76,7 +88,6 @@ def uniformRDD(sc, size, numPartitions=None, seed=None): return callMLlibFunc("uniformRDD", sc._jsc, size, numPartitions, seed) @staticmethod - @since("1.1.0") def normalRDD(sc, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the standard normal @@ -86,12 +97,26 @@ def normalRDD(sc, size, numPartitions=None, seed=None): to some other normal N(mean, sigma^2), use ``RandomRDDs.normal(sc, n, p, seed).map(lambda v: mean + sigma * v)`` - :param sc: SparkContext used to create the RDD. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0). - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + used to create the RDD. + size : int + Size of the RDD. 
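A usage sketch for the scalar random RDD generators being re-documented here, assuming `sc`; the range (a, b) and sizes are arbitrary:

```
from pyspark.mllib.random import RandomRDDs

# i.i.d. samples from U(0.0, 1.0)
x = RandomRDDs.uniformRDD(sc, size=1000, numPartitions=4, seed=42)
print(x.count())                    # 1000

# Shift and scale to U(a, b), as suggested in the docstring
a, b = -5.0, 5.0
y = x.map(lambda v: a + (b - a) * v)
print(y.stats())                    # mean roughly 0, min/max within (a, b)
```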
+ numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0). + + Examples + -------- >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1) >>> stats = x.stats() >>> stats.count() @@ -104,20 +129,34 @@ def normalRDD(sc, size, numPartitions=None, seed=None): return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed) @staticmethod - @since("1.3.0") def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the log normal distribution with the input mean and standard distribution. - :param sc: SparkContext used to create the RDD. - :param mean: mean for the log Normal distribution - :param std: std for the log Normal distribution - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ log N(mean, std). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + used to create the RDD. + mean : float + mean for the log Normal distribution + std : float + std for the log Normal distribution + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + RDD of float comprised of i.i.d. samples ~ log N(mean, std). + + Examples + -------- >>> from math import sqrt, exp >>> mean = 0.0 >>> std = 1.0 @@ -137,19 +176,33 @@ def logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None): size, numPartitions, seed) @staticmethod - @since("1.1.0") def poissonRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Poisson distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or lambda, for the Poisson distribution. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Pois(mean). - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or lambda, for the Poisson distribution. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ Pois(mean). + + Examples + -------- >>> mean = 100.0 >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=2) >>> stats = x.stats() @@ -164,19 +217,33 @@ def poissonRDD(sc, mean, size, numPartitions=None, seed=None): return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed) @staticmethod - @since("1.3.0") def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Exponential distribution with the input mean. - :param sc: SparkContext used to create the RDD. 
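Similarly for the normal and log-normal generators above; a sketch assuming `sc`, with the target mean and sigma picked only for illustration:

```
from math import exp
from pyspark.mllib.random import RandomRDDs

# Standard normal samples, shifted and scaled to N(mean, sigma^2)
mean, sigma = 10.0, 2.0
z = RandomRDDs.normalRDD(sc, 10000, seed=1).map(lambda v: mean + sigma * v)
stats = z.stats()
print(abs(stats.mean() - mean) < 0.5)      # True
print(abs(stats.stdev() - sigma) < 0.5)    # True

# Log-normal with log-mean 0.0 and log-std 1.0; E[X] = exp(mu + std^2 / 2)
ln = RandomRDDs.logNormalRDD(sc, 0.0, 1.0, 10000, seed=1)
print(abs(ln.stats().mean() - exp(0.5)) < 0.5)   # True
```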
- :param mean: Mean, or 1 / lambda, for the Exponential distribution. - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Exp(mean). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or 1 / lambda, for the Exponential distribution. + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ Exp(mean). + + Examples + -------- >>> mean = 2.0 >>> x = RandomRDDs.exponentialRDD(sc, mean, 1000, seed=2) >>> stats = x.stats() @@ -191,20 +258,35 @@ def exponentialRDD(sc, mean, size, numPartitions=None, seed=None): return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed) @staticmethod - @since("1.3.0") def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): """ Generates an RDD comprised of i.i.d. samples from the Gamma distribution with the input shape and scale. - :param sc: SparkContext used to create the RDD. - :param shape: shape (> 0) parameter for the Gamma distribution - :param scale: scale (> 0) parameter for the Gamma distribution - :param size: Size of the RDD. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + shape : float + shape (> 0) parameter for the Gamma distribution + scale : float + scale (> 0) parameter for the Gamma distribution + size : int + Size of the RDD. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of float comprised of i.i.d. samples ~ Gamma(shape, scale). + + Examples + -------- >>> from math import sqrt >>> shape = 1.0 >>> scale = 2.0 @@ -224,19 +306,33 @@ def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.1.0") def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the uniform distribution U(0.0, 1.0). - :param sc: SparkContext used to create the RDD. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD. - :param seed: Seed for the RNG that generates the seed for the generator in each partition. - :return: RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD. 
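The exponential and gamma generators follow the same pattern; a sketch assuming `sc`, with parameters chosen so the moments are easy to check:

```
from math import sqrt
from pyspark.mllib.random import RandomRDDs

# Exponential with mean 1/lambda = 2.0
e = RandomRDDs.exponentialRDD(sc, mean=2.0, size=10000, seed=2)
print(abs(e.stats().mean() - 2.0) < 0.5)                   # True

# Gamma(shape, scale): mean = shape * scale, stdev = sqrt(shape) * scale
shape, scale = 1.0, 2.0
gs = RandomRDDs.gammaRDD(sc, shape, scale, 10000, seed=2).stats()
print(abs(gs.mean() - shape * scale) < 0.5)                # True
print(abs(gs.stdev() - sqrt(shape) * scale) < 0.5)         # True
```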
+ seed : int, optional + Seed for the RNG that generates the seed for the generator in each partition. + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`. + + Examples + -------- >>> import numpy as np >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect()) >>> mat.shape @@ -250,19 +346,33 @@ def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.1.0") def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the standard normal distribution. - :param sc: SparkContext used to create the RDD. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`. - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`. + + Examples + -------- >>> import numpy as np >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect()) >>> mat.shape @@ -276,21 +386,37 @@ def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.3.0") def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the log normal distribution. - :param sc: SparkContext used to create the RDD. - :param mean: Mean of the log normal distribution - :param std: Standard Deviation of the log normal distribution - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`. - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean of the log normal distribution + std : float + Standard Deviation of the log normal distribution + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ log `N(mean, std)`. 
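The vector-valued variants return RDDs of NumPy arrays that collect cleanly into a matrix; a sketch assuming `sc` and NumPy, with arbitrary shapes:

```
import numpy as np
from pyspark.mllib.random import RandomRDDs

# 100 vectors of 10 standard-normal samples each
mat = np.asarray(
    RandomRDDs.normalVectorRDD(sc, numRows=100, numCols=10, seed=1).collect())
print(mat.shape)                     # (100, 10)
print(round(float(mat.mean()), 1))   # roughly 0.0
print(round(float(mat.std()), 1))    # roughly 1.0

# Log-normal vectors with the requested log-mean and log-std
lmat = np.asarray(
    RandomRDDs.logNormalVectorRDD(sc, 0.0, 1.0, 100, 10, seed=1).collect())
print(lmat.shape)                    # (100, 10)
```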
+ + Examples + -------- >>> import numpy as np >>> from math import sqrt, exp >>> mean = 0.0 @@ -311,20 +437,35 @@ def logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed @staticmethod @toArray - @since("1.1.0") def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Poisson distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or lambda, for the Poisson distribution. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`) - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean). - + .. versionadded:: 1.1.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or lambda, for the Poisson distribution. + numRows : float + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`) + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean). + + Examples + -------- >>> import numpy as np >>> mean = 100.0 >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1) @@ -342,20 +483,35 @@ def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): @staticmethod @toArray - @since("1.3.0") def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Exponential distribution with the input mean. - :param sc: SparkContext used to create the RDD. - :param mean: Mean, or 1 / lambda, for the Exponential distribution. - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`) - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + mean : float + Mean, or 1 / lambda, for the Exponential distribution. + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`) + seed : int, optional + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ Exp(mean). + + Examples + -------- >>> import numpy as np >>> mean = 0.5 >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1) @@ -373,21 +529,37 @@ def exponentialVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=No @staticmethod @toArray - @since("1.3.0") def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Gamma distribution. - :param sc: SparkContext used to create the RDD. 
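The vector-valued generators converted above follow the same pattern; a small sketch, again assuming a live `SparkContext` `sc`, with NumPy used only to inspect the collected result:

```
import numpy as np
from pyspark.mllib.random import RandomRDDs

# 100 vectors of 10 i.i.d. Poisson(mean=5.0) samples each.
rdd = RandomRDDs.poissonVectorRDD(sc, mean=5.0, numRows=100, numCols=10, seed=1)
mat = np.array(rdd.collect())
print(mat.shape)                    # (100, 10)
print(abs(mat.mean() - 5.0) < 0.5)  # sample mean close to 5.0
```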
- :param shape: Shape (> 0) of the Gamma distribution - :param scale: Scale (> 0) of the Gamma distribution - :param numRows: Number of Vectors in the RDD. - :param numCols: Number of elements in each Vector. - :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`). - :param seed: Random seed (default: a random long integer). - :return: RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale). - + .. versionadded:: 1.3.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + SparkContext used to create the RDD. + shape : float + Shape (> 0) of the Gamma distribution + scale : float + Scale (> 0) of the Gamma distribution + numRows : int + Number of Vectors in the RDD. + numCols : int + Number of elements in each Vector. + numPartitions : int, optional + Number of partitions in the RDD (default: `sc.defaultParallelism`). + seed : int, optional, + Random seed (default: a random long integer). + + Returns + ------- + :py:class:`pyspark.RDD` + RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale). + + Examples + -------- >>> import numpy as np >>> from math import sqrt >>> shape = 1.0 diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 3dd7cb200c280..7a5fb6e6eea9e 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -32,13 +32,15 @@ class Rating(namedtuple("Rating", ["user", "product", "rating"])): """ Represents a (user, product, rating) tuple. + .. versionadded:: 1.2.0 + + Examples + -------- >>> r = Rating(1, 2, 5.0) >>> (r.user, r.product, r.rating) (1, 2, 5.0) >>> (r[0], r[1], r[2]) (1, 2, 5.0) - - .. versionadded:: 1.2.0 """ def __reduce__(self): @@ -51,6 +53,10 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): """A matrix factorisation model trained by regularized alternating least-squares. + .. versionadded:: 0.9.0 + + Examples + -------- >>> r1 = (1, 1, 1.0) >>> r2 = (1, 2, 2.0) >>> r3 = (2, 1, 2.0) @@ -126,8 +132,6 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 0.9.0 """ @since("0.9.0") def predict(self, user, product): @@ -237,7 +241,6 @@ def _prepare(cls, ratings): return ratings @classmethod - @since("0.9.0") def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None): """ @@ -247,35 +250,38 @@ def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative features). To solve for these features, ALS is run iteratively with a configurable level of parallelism. - :param ratings: - RDD of `Rating` or (userID, productID, rating) tuple. - :param rank: - Number of features to use (also referred to as the number of latent factors). - :param iterations: - Number of iterations of ALS. - (default: 5) - :param lambda_: - Regularization parameter. - (default: 0.01) - :param blocks: - Number of blocks used to parallelize the computation. A value - of -1 will use an auto-configured number of blocks. - (default: -1) - :param nonnegative: - A value of True will solve least-squares with nonnegativity - constraints. - (default: False) - :param seed: - Random seed for initial matrix factorization model. A value - of None will use system time as the seed. - (default: None) + .. versionadded:: 0.9.0 + + Parameters + ---------- + ratings : :py:class:`pyspark.RDD` + RDD of `Rating` or (userID, productID, rating) tuple. 
+ rank : int + Number of features to use (also referred to as the number of latent factors). + iterations : int, optional + Number of iterations of ALS. + (default: 5) + lambda\\_ : float, optional + Regularization parameter. + (default: 0.01) + blocks : int, optional + Number of blocks used to parallelize the computation. A value + of -1 will use an auto-configured number of blocks. + (default: -1) + nonnegative : bool, optional + A value of True will solve least-squares with nonnegativity + constraints. + (default: False) + seed : bool, optional + Random seed for initial matrix factorization model. A value + of None will use system time as the seed. + (default: None) """ model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, nonnegative, seed) return MatrixFactorizationModel(model) @classmethod - @since("0.9.0") def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None): """ @@ -285,31 +291,35 @@ def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alp given rank (number of features). To solve for these features, ALS is run iteratively with a configurable level of parallelism. - :param ratings: - RDD of `Rating` or (userID, productID, rating) tuple. - :param rank: - Number of features to use (also referred to as the number of latent factors). - :param iterations: - Number of iterations of ALS. - (default: 5) - :param lambda_: - Regularization parameter. - (default: 0.01) - :param blocks: - Number of blocks used to parallelize the computation. A value - of -1 will use an auto-configured number of blocks. - (default: -1) - :param alpha: - A constant used in computing confidence. - (default: 0.01) - :param nonnegative: - A value of True will solve least-squares with nonnegativity - constraints. - (default: False) - :param seed: - Random seed for initial matrix factorization model. A value - of None will use system time as the seed. - (default: None) + .. versionadded:: 0.9.0 + + Parameters + ---------- + ratings : :py:class:`pyspark.RDD` + RDD of `Rating` or (userID, productID, rating) tuple. + rank : int + Number of features to use (also referred to as the number of latent factors). + iterations : int, optional + Number of iterations of ALS. + (default: 5) + lambda\\_ : float, optional + Regularization parameter. + (default: 0.01) + blocks : int, optional + Number of blocks used to parallelize the computation. A value + of -1 will use an auto-configured number of blocks. + (default: -1) + alpha : float, optional + A constant used in computing confidence. + (default: 0.01) + nonnegative : bool, optional + A value of True will solve least-squares with nonnegativity + constraints. + (default: False) + seed : int, optional + Random seed for initial matrix factorization model. A value + of None will use system time as the seed. + (default: None) """ model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, alpha, nonnegative, seed) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 77bca86ac1b27..e549b0ac43721 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -39,15 +39,19 @@ class LabeledPoint(object): """ Class that represents the features and labels of a data point. - :param label: - Label for this data point. 
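A compact sketch of the RDD-based ALS training API whose docstring is rewritten above; `sc` is assumed to be a live `SparkContext` and the tiny in-memory ratings are purely illustrative:

```
from pyspark.mllib.recommendation import ALS, Rating

# Explicit-feedback ratings; plain (user, product, rating) tuples work as well.
ratings = sc.parallelize([
    Rating(1, 1, 5.0), Rating(1, 2, 1.0),
    Rating(2, 1, 1.0), Rating(2, 2, 5.0),
])
model = ALS.train(ratings, rank=4, iterations=10, seed=42)
print(model.predict(1, 2))            # predicted rating for (user=1, product=2)
print(model.recommendProducts(1, 1))  # top-1 product for user 1
```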
- :param features: - Vector of features for this point (NumPy array, list, - pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). - - .. note:: 'label' and 'features' are accessible as class attributes. - .. versionadded:: 1.0.0 + + Parameters + ---------- + label : int + Label for this data point. + features : :py:class:`pyspark.mllib.linalg.Vector` or convertible + Vector of features for this point (NumPy array, list, + pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). + + Notes + ----- + 'label' and 'features' are accessible as class attributes. """ def __init__(self, label, features): @@ -69,12 +73,14 @@ class LinearModel(object): """ A linear model that has a vector of coefficients and an intercept. - :param weights: - Weights computed for every feature. - :param intercept: - Intercept computed for this model. - .. versionadded:: 0.9.0 + + Parameters + ---------- + weights : :py:class:`pyspark.mllib.linalg.Vector` + Weights computed for every feature. + intercept : float + Intercept computed for this model. """ def __init__(self, weights, intercept): @@ -102,14 +108,16 @@ class LinearRegressionModelBase(LinearModel): """A linear regression model. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> lrmb = LinearRegressionModelBase(np.array([1.0, 2.0]), 0.1) >>> abs(lrmb.predict(np.array([-1.03, 7.777])) - 14.624) < 1e-6 True >>> abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6 True - - .. versionadded:: 0.9.0 """ @since("0.9.0") @@ -129,6 +137,10 @@ class LinearRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = [ @@ -181,8 +193,6 @@ class LinearRegressionModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True - - .. versionadded:: 0.9.0 """ @since("1.4.0") def save(self, sc, path): @@ -224,11 +234,13 @@ def _regression_train_wrapper(train_func, modelClass, data, initial_weights): class LinearRegressionWithSGD(object): """ + Train a linear regression model with no regularization using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression. + .. deprecated:: 2.0.0 + Use :py:class:`pyspark.ml.regression.LinearRegression`. """ @classmethod - @since("0.9.0") def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None, regParam=0.0, regType=None, intercept=False, validateData=True, convergenceTol=0.001): @@ -244,42 +256,47 @@ def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. - (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param regParam: - The regularizer parameter. - (default: 0.0) - :param regType: - The type of regularizer used for training our model. - Supported values: + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of LabeledPoint. 
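For the `LinearRegressionWithSGD.train` signature documented above, a short sketch under the same assumption of a live `sc`; the API is deprecated since 2.0.0 in favour of `pyspark.ml.regression.LinearRegression` and is shown only to illustrate the parameters being documented:

```
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

# A noiseless y = x line with no intercept.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0]),
    LabeledPoint(2.0, [2.0]), LabeledPoint(3.0, [3.0]),
])
lrm = LinearRegressionWithSGD.train(data, iterations=100, initialWeights=[1.0])
print(lrm.predict([4.0]))  # roughly 4.0 for this toy data
```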
+ iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + regParam : float, optional + The regularizer parameter. + (default: 0.0) + regType : str, optional + The type of regularizer used for training our model. + Supported values: - "l1" for using L1 regularization - "l2" for using L2 regularization - None for no regularization (default) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e., whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e., whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. Use ml.regression.LinearRegression.", DeprecationWarning) @@ -299,6 +316,10 @@ class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_1 penalty term. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = [ @@ -351,8 +372,6 @@ class LassoModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True - - .. versionadded:: 0.9.0 """ @since("1.4.0") def save(self, sc, path): @@ -375,12 +394,14 @@ def load(cls, sc, path): class LassoWithSGD(object): """ + Train a regression model with L1-regularization using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0. - Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression. + .. deprecated:: 2.0.0 + Use :py:class:`pyspark.ml.regression.LinearRegression` with elasticNetParam = 1.0. + Note the default regParam is 0.01 for LassoWithSGD, but is 0.0 for LinearRegression. """ @classmethod - @since("0.9.0") def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, intercept=False, validateData=True, convergenceTol=0.001): @@ -395,35 +416,39 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. 
- (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of LabeledPoint. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 1.0. " @@ -444,6 +469,10 @@ class RidgeRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_2 penalty term. + .. versionadded:: 0.9.0 + + Examples + -------- >>> from pyspark.mllib.linalg import SparseVector >>> from pyspark.mllib.regression import LabeledPoint >>> data = [ @@ -496,8 +525,6 @@ class RidgeRegressionModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True - - .. versionadded:: 0.9.0 """ @since("1.4.0") def save(self, sc, path): @@ -520,13 +547,15 @@ def load(cls, sc, path): class RidgeRegressionWithSGD(object): """ + Train a regression model with L2-regularization using Stochastic Gradient Descent. + .. versionadded:: 0.9.0 - .. note:: Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0. - Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for - LinearRegression. + .. deprecated:: 2.0.0 + Use :py:class:`pyspark.ml.regression.LinearRegression` with elasticNetParam = 0.0. + Note the default regParam is 0.01 for RidgeRegressionWithSGD, but is 0.0 for + LinearRegression. """ @classmethod - @since("0.9.0") def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, intercept=False, validateData=True, convergenceTol=0.001): @@ -541,35 +570,39 @@ def train(cls, data, iterations=100, step=1.0, regParam=0.01, of rows of A, each with its corresponding right hand side label y. See also the documentation for the precise formulation. - :param data: - The training data, an RDD of LabeledPoint. - :param iterations: - The number of iterations. - (default: 100) - :param step: - The step parameter used in SGD. - (default: 1.0) - :param regParam: - The regularizer parameter. - (default: 0.01) - :param miniBatchFraction: - Fraction of data to be used for each SGD iteration. 
- (default: 1.0) - :param initialWeights: - The initial weights. - (default: None) - :param intercept: - Boolean parameter which indicates the use or not of the - augmented representation for training data (i.e. whether bias - features are activated or not). - (default: False) - :param validateData: - Boolean parameter which indicates if the algorithm should - validate data before training. - (default: True) - :param convergenceTol: - A condition which decides iteration termination. - (default: 0.001) + .. versionadded:: 0.9.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + The training data, an RDD of LabeledPoint. + iterations : int, optional + The number of iterations. + (default: 100) + step : float, optional + The step parameter used in SGD. + (default: 1.0) + regParam : float, optional + The regularizer parameter. + (default: 0.01) + miniBatchFraction : float, optional + Fraction of data to be used for each SGD iteration. + (default: 1.0) + initialWeights : :py:class:`pyspark.mllib.linalg.Vector` or convertible, optional + The initial weights. + (default: None) + intercept : bool, optional + Boolean parameter which indicates the use or not of the + augmented representation for training data (i.e. whether bias + features are activated or not). + (default: False) + validateData : bool, optional + Boolean parameter which indicates if the algorithm should + validate data before training. + (default: True) + convergenceTol : float, optional + A condition which decides iteration termination. + (default: 0.001) """ warnings.warn( "Deprecated in 2.0.0. Use ml.regression.LinearRegression with elasticNetParam = 0.0. " @@ -589,15 +622,21 @@ class IsotonicRegressionModel(Saveable, Loader): """ Regression model for isotonic regression. - :param boundaries: - Array of boundaries for which predictions are known. Boundaries - must be sorted in increasing order. - :param predictions: - Array of predictions associated to the boundaries at the same - index. Results of isotonic regression and therefore monotone. - :param isotonic: - Indicates whether this is isotonic or antitonic. + .. versionadded:: 1.4.0 + Parameters + ---------- + boundaries : ndarray + Array of boundaries for which predictions are known. Boundaries + must be sorted in increasing order. + predictions : ndarray + Array of predictions associated to the boundaries at the same + index. Results of isotonic regression and therefore monotone. + isotonic : true + Indicates whether this is isotonic or antitonic. + + Examples + -------- >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)] >>> irm = IsotonicRegression.train(sc.parallelize(data)) >>> irm.predict(3) @@ -619,8 +658,6 @@ class IsotonicRegressionModel(Saveable, Loader): ... rmtree(path) ... except OSError: ... pass - - .. versionadded:: 1.4.0 """ def __init__(self, boundaries, predictions, isotonic): @@ -628,7 +665,6 @@ def __init__(self, boundaries, predictions, isotonic): self.predictions = predictions self.isotonic = isotonic - @since("1.4.0") def predict(self, x): """ Predict labels for provided features. @@ -647,8 +683,13 @@ def predict(self, x): values with the same boundary then the same rules as in 2) are used. - :param x: - Feature or RDD of Features to be labeled. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Feature or RDD of Features to be labeled. 
""" if isinstance(x, RDD): return x.map(lambda v: self.predict(v)) @@ -680,35 +721,42 @@ class IsotonicRegression(object): Currently implemented using parallelized pool adjacent violators algorithm. Only univariate (single feature) algorithm supported. - Sequential PAV implementation based on: + .. versionadded:: 1.4.0 + + Notes + ----- + Sequential PAV implementation based on + Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani (2011) [1]_ - Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. - "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. - Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf + Sequential PAV parallelization based on + Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset (1996) [2]_ - Sequential PAV parallelization based on: + See also + `Isotonic regression (Wikipedia) `_. - Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset. + .. [1] Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani. + "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61. + Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf + .. [2] Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset "An approach to parallelizing isotonic regression." Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147. Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf - - See `Isotonic regression (Wikipedia) `_. - - .. versionadded:: 1.4.0 """ @classmethod - @since("1.4.0") def train(cls, data, isotonic=True): """ Train an isotonic regression model on the given data. - :param data: - RDD of (label, feature, weight) tuples. - :param isotonic: - Whether this is isotonic (which is default) or antitonic. - (default: True) + .. versionadded:: 1.4.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + RDD of (label, feature, weight) tuples. + isotonic : bool, optional + Whether this is isotonic (which is default) or antitonic. + (default: True) """ boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel", data.map(_convert_to_vector), bool(isotonic)) @@ -741,26 +789,32 @@ def _validate(self, dstream): raise ValueError( "Model must be intialized using setInitialWeights") - @since("1.5.0") def predictOn(self, dstream): """ Use the model to make predictions on batches of data from a DStream. - :return: - DStream containing predictions. + .. versionadded:: 1.5.0 + + Returns + ------- + :py:class:`pyspark.streaming.DStream` + DStream containing predictions. """ self._validate(dstream) return dstream.map(lambda x: self._model.predict(x)) - @since("1.5.0") def predictOnValues(self, dstream): """ Use the model to make predictions on the values of a DStream and carry over its keys. - :return: - DStream containing the input keys and the predictions as values. + .. versionadded:: 1.5.0 + + Returns + ------- + :py:class:`pyspark.streaming.DStream` + DStream containing predictions. """ self._validate(dstream) return dstream.mapValues(lambda x: self._model.predict(x)) @@ -779,20 +833,22 @@ class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm): of features must be constant. An initial weight vector must be provided. - :param stepSize: - Step size for each iteration of gradient descent. - (default: 0.1) - :param numIterations: - Number of iterations run for each batch of data. - (default: 50) - :param miniBatchFraction: - Fraction of each batch of data to use for updates. 
- (default: 1.0) - :param convergenceTol: - Value used to determine when to terminate iterations. - (default: 0.001) - .. versionadded:: 1.5.0 + + Parameters + ---------- + stepSize : float, optional + Step size for each iteration of gradient descent. + (default: 0.1) + numIterations : int, optional + Number of iterations run for each batch of data. + (default: 50) + miniBatchFraction : float, optional + Fraction of each batch of data to use for updates. + (default: 1.0) + convergenceTol : float, optional + Value used to determine when to terminate iterations. + (default: 0.001) """ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, convergenceTol=0.001): self.stepSize = stepSize diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py index 56444c152f0ba..1d4d43e53519c 100644 --- a/python/pyspark/mllib/stat/KernelDensity.py +++ b/python/pyspark/mllib/stat/KernelDensity.py @@ -26,6 +26,8 @@ class KernelDensity(object): Estimate probability density at required points given an RDD of samples from the population. + Examples + -------- >>> kd = KernelDensity() >>> sample = sc.parallelize([0.0, 1.0]) >>> kd.setSample(sample) diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py index 0fb33061838af..d3b4ddf7e4c68 100644 --- a/python/pyspark/mllib/stat/__init__.py +++ b/python/pyspark/mllib/stat/__init__.py @@ -21,8 +21,9 @@ from pyspark.mllib.stat._statistics import Statistics, MultivariateStatisticalSummary from pyspark.mllib.stat.distribution import MultivariateGaussian -from pyspark.mllib.stat.test import ChiSqTestResult +from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult from pyspark.mllib.stat.KernelDensity import KernelDensity -__all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult", +__all__ = ["Statistics", "MultivariateStatisticalSummary", + "ChiSqTestResult", "KolmogorovSmirnovTestResult", "MultivariateGaussian", "KernelDensity"] diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 43454ba5187dd..a4b45cf55febe 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -65,11 +65,19 @@ def colStats(rdd): """ Computes column-wise summary statistics for the input RDD[Vector]. - :param rdd: an RDD[Vector] for which column-wise summary statistics - are to be computed. - :return: :class:`MultivariateStatisticalSummary` object containing - column-wise summary statistics. - + Parameters + ---------- + rdd : :py:class:`pyspark.RDD` + an RDD[Vector] for which column-wise summary statistics + are to be computed. + + Returns + ------- + :class:`MultivariateStatisticalSummary` + object containing column-wise summary statistics. + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]), ... Vectors.dense([4, 5, 0, 3]), @@ -103,13 +111,24 @@ def corr(x, y=None, method=None): to specify the method to be used for single RDD inout. If two RDDs of floats are passed in, a single float is returned. - :param x: an RDD of vector for which the correlation matrix is to be computed, - or an RDD of float of the same cardinality as y when y is specified. - :param y: an RDD of float of the same cardinality as x. - :param method: String specifying the method to use for computing correlation. - Supported: `pearson` (default), `spearman` - :return: Correlation matrix comparing columns in x. 
- + Parameters + ---------- + x : :py:class:`pyspark.RDD` + an RDD of vector for which the correlation matrix is to be computed, + or an RDD of float of the same cardinality as y when y is specified. + y : :py:class:`pyspark.RDD`, optional + an RDD of float of the same cardinality as x. + method : str, optional + String specifying the method to use for computing correlation. + Supported: `pearson` (default), `spearman` + + Returns + ------- + :py:class:`pyspark.mllib.linalg.Matrix` + Correlation matrix comparing columns in x. + + Examples + -------- >>> x = sc.parallelize([1.0, 0.0, -2.0], 2) >>> y = sc.parallelize([4.0, 5.0, 3.0], 2) >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2) @@ -172,20 +191,33 @@ def chiSqTest(observed, expected=None): contingency matrix for which the chi-squared statistic is computed. All label and feature values must be categorical. - .. note:: `observed` cannot contain negative values - - :param observed: it could be a vector containing the observed categorical - counts/relative frequencies, or the contingency matrix - (containing either counts or relative frequencies), - or an RDD of LabeledPoint containing the labeled dataset - with categorical features. Real-valued features will be - treated as categorical for each distinct value. - :param expected: Vector containing the expected categorical counts/relative - frequencies. `expected` is rescaled if the `expected` sum - differs from the `observed` sum. - :return: ChiSquaredTest object containing the test statistic, degrees - of freedom, p-value, the method used, and the null hypothesis. - + Parameters + ---------- + observed : :py:class:`pyspark.mllib.linalg.Vector` or \ + :py:class:`pyspark.mllib.linalg.Matrix` + it could be a vector containing the observed categorical + counts/relative frequencies, or the contingency matrix + (containing either counts or relative frequencies), + or an RDD of LabeledPoint containing the labeled dataset + with categorical features. Real-valued features will be + treated as categorical for each distinct value. + expected : :py:class:`pyspark.mllib.linalg.Vector` + Vector containing the expected categorical counts/relative + frequencies. `expected` is rescaled if the `expected` sum + differs from the `observed` sum. + + Returns + ------- + :py:class:`pyspark.mllib.stat.ChiSqTestResult` + object containing the test statistic, degrees + of freedom, p-value, the method used, and the null hypothesis. + + Notes + ----- + `observed` cannot contain negative values + + Examples + -------- >>> from pyspark.mllib.linalg import Vectors, Matrices >>> observed = Vectors.dense([4, 6, 5]) >>> pearson = Statistics.chiSqTest(observed) @@ -259,17 +291,28 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): For specific details of the implementation, please have a look at the Scala documentation. - :param data: RDD, samples from the data - :param distName: string, currently only "norm" is supported. - (Normal distribution) to calculate the - theoretical distribution of the data. - :param params: additional values which need to be provided for - a certain distribution. - If not provided, the default values are used. - :return: KolmogorovSmirnovTestResult object containing the test - statistic, degrees of freedom, p-value, - the method used, and the null hypothesis. + Parameters + ---------- + data : :py:class:`pyspark.RDD` + RDD, samples from the data + distName : str, optional + string, currently only "norm" is supported. 
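The `Statistics.corr` and `Statistics.chiSqTest` docstrings converted above can be exercised with a few lines, assuming a live `sc`:

```
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

x = sc.parallelize([1.0, 0.0, -2.0])
y = sc.parallelize([4.0, 5.0, 3.0])
print(Statistics.corr(x, y, method="pearson"))  # single float for two RDDs of floats

observed = Vectors.dense([4.0, 6.0, 5.0])
chi = Statistics.chiSqTest(observed)            # goodness of fit against a uniform expectation
print(chi.statistic, chi.pValue)
```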
+ (Normal distribution) to calculate the + theoretical distribution of the data. + params + additional values which need to be provided for + a certain distribution. + If not provided, the default values are used. + + Returns + ------- + :py:class:`pyspark.mllib.stat.KolmogorovSmirnovTestResult` + object containing the test statistic, degrees of freedom, p-value, + the method used, and the null hypothesis. + + Examples + -------- >>> kstest = Statistics.kolmogorovSmirnovTest >>> data = sc.parallelize([-1.0, 0.0, 1.0]) >>> ksmodel = kstest(data, "norm") diff --git a/python/pyspark/mllib/stat/distribution.py b/python/pyspark/mllib/stat/distribution.py index 46f7a1d2f277a..aa35ac6dfdae1 100644 --- a/python/pyspark/mllib/stat/distribution.py +++ b/python/pyspark/mllib/stat/distribution.py @@ -24,6 +24,8 @@ class MultivariateGaussian(namedtuple('MultivariateGaussian', ['mu', 'sigma'])): """Represents a (mu, sigma) tuple + Examples + -------- >>> m = MultivariateGaussian(Vectors.dense([11,12]),DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0))) >>> (m.mu, m.sigma.toArray()) (DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]])) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index e05dfdb953ceb..493dcf8db6fd2 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -33,15 +33,18 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable): .. versionadded:: 1.3.0 """ - @since("1.3.0") def predict(self, x): """ Predict values for a single data point or an RDD of points using the model trained. - .. note:: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. versionadded:: 1.3.0 + + Notes + ----- + In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -79,18 +82,23 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): .. versionadded:: 1.1.0 """ - @since("1.1.0") def predict(self, x): """ Predict the label of one or more examples. - .. note:: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. versionadded:: 1.1.0 + + Parameters + ---------- + x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD` + Data point (feature vector), or an RDD of data points (feature + vectors). - :param x: - Data point (feature vector), or an RDD of data points (feature - vectors). + Notes + ----- + In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -143,45 +151,50 @@ def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, m return DecisionTreeModel(model) @classmethod - @since("1.1.0") def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ Train a decision tree model for classification. - :param data: - Training data: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. - :param numClasses: - Number of classes for classification. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. 
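Similarly for the Kolmogorov-Smirnov test documented above, assuming a live `sc`; the trailing positional arguments are the parameters of the theoretical distribution (mean and standard deviation for "norm"):

```
from pyspark.mllib.stat import Statistics

data = sc.parallelize([-1.0, 0.0, 1.0])
# Test the sample against a standard normal distribution N(0.0, 1.0).
ks = Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0)
print(ks.statistic, ks.pValue)
```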
- :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" or "entropy". - (default: "gini") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 5) - :param maxBins: - Number of bins used for finding splits at each node. - (default: 32) - :param minInstancesPerNode: - Minimum number of instances required at child nodes to create - the parent split. - (default: 1) - :param minInfoGain: - Minimum info gain required to create a split. - (default: 0.0) - :return: - DecisionTreeModel. - - Example usage: - + .. versionadded:: 1.1.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training data: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + numClasses : int + Number of classes for classification. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + impurity : str, optional + Criterion used for information gain calculation. + Supported values: "gini" or "entropy". + (default: "gini") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 5) + maxBins : int, optional + Number of bins used for finding splits at each node. + (default: 32) + minInstancesPerNode : int, optional + Minimum number of instances required at child nodes to create + the parent split. + (default: 1) + minInfoGain : float, optional + Minimum info gain required to create a split. + (default: 0.0) + + Returns + ------- + :py:class:`DecisionTreeModel` + + Examples + -------- >>> from numpy import array >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree @@ -222,35 +235,39 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Train a decision tree model for regression. - :param data: - Training data: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param impurity: - Criterion used for information gain calculation. - The only supported value for regression is "variance". - (default: "variance") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 5) - :param maxBins: - Number of bins used for finding splits at each node. - (default: 32) - :param minInstancesPerNode: - Minimum number of instances required at child nodes to create - the parent split. - (default: 1) - :param minInfoGain: - Minimum info gain required to create a split. - (default: 0.0) - :return: - DecisionTreeModel. - - Example usage: - + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training data: RDD of LabeledPoint. Labels are real numbers. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + impurity : str, optional + Criterion used for information gain calculation. + The only supported value for regression is "variance". + (default: "variance") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). 
+ (default: 5) + maxBins : int, optional + Number of bins used for finding splits at each node. + (default: 32) + minInstancesPerNode : int, optional + Minimum number of instances required at child nodes to create + the parent split. + (default: 1) + minInfoGain : float, optional + Minimum info gain required to create a split. + (default: 0.0) + + Returns + ------- + :py:class:`DecisionTreeModel` + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree >>> from pyspark.mllib.linalg import SparseVector @@ -313,7 +330,6 @@ def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, return RandomForestModel(model) @classmethod - @since("1.2.0") def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, seed=None): @@ -321,44 +337,51 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Train a random forest model for binary or multiclass classification. - :param data: - Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. - :param numClasses: - Number of classes for classification. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param numTrees: - Number of trees in the random forest. - :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported values: "auto", "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - (default: "auto") - :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" or "entropy". - (default: "gini") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 4) - :param maxBins: - Maximum number of bins used for splitting features. - (default: 32) - :param seed: - Random seed for bootstrapping and choosing feature subsets. - Set as None to generate seed based on system time. - (default: None) - :return: - RandomForestModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + numClasses : int + Number of classes for classification. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + numTrees : int + Number of trees in the random forest. + featureSubsetStrategy : str, optional + Number of features to consider for splits at each node. + Supported values: "auto", "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + (default: "auto") + impurity : str, optional + Criterion used for information gain calculation. + Supported values: "gini" or "entropy". + (default: "gini") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). 
+ (default: 4) + maxBins : int, optional + Maximum number of bins used for splitting features. + (default: 32) + seed : int, Optional + Random seed for bootstrapping and choosing feature subsets. + Set as None to generate seed based on system time. + (default: None) + + Returns + ------- + :py:class:`RandomForestModel` + that can be used for prediction. + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import RandomForest >>> @@ -405,47 +428,55 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, maxDepth, maxBins, seed) @classmethod - @since("1.2.0") def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32, seed=None): """ Train a random forest model for regression. - :param data: - Training dataset: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param numTrees: - Number of trees in the random forest. - :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported values: "auto", "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - (default: "auto") - :param impurity: - Criterion used for information gain calculation. - The only supported value for regression is "variance". - (default: "variance") - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 4) - :param maxBins: - Maximum number of bins used for splitting features. - (default: 32) - :param seed: - Random seed for bootstrapping and choosing feature subsets. - Set as None to generate seed based on system time. - (default: None) - :return: - RandomForestModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.2.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels are real numbers. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + numTrees : int + Number of trees in the random forest. + featureSubsetStrategy : str, optional + Number of features to consider for splits at each node. + Supported values: "auto", "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + + - if numTrees == 1, set to "all"; + - if numTrees > 1 (forest) set to "onethird" for regression. + + (default: "auto") + impurity : str, optional + Criterion used for information gain calculation. + The only supported value for regression is "variance". + (default: "variance") + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 4) + maxBins : int, optional + Maximum number of bins used for splitting features. + (default: 32) + seed : int, optional + Random seed for bootstrapping and choosing feature subsets. + Set as None to generate seed based on system time. + (default: None) + + Returns + ------- + :py:class:`RandomForestModel` + that can be used for prediction. 
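A short sketch of `RandomForest.trainClassifier` as documented above, assuming a live `sc`; the four training points and the fixed seed are illustrative only:

```
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

data = sc.parallelize([
    LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
    LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0]),
])
model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, seed=42)
print(model.numTrees())
print(model.predict([2.5]))  # expected to be 1.0 for this toy split
```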
+ + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import RandomForest >>> from pyspark.mllib.linalg import SparseVector @@ -505,45 +536,51 @@ def _train(cls, data, algo, categoricalFeaturesInfo, return GradientBoostedTreesModel(model) @classmethod - @since("1.3.0") def trainClassifier(cls, data, categoricalFeaturesInfo, loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): """ Train a gradient-boosted trees model for classification. - :param data: - Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1}. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param loss: - Loss function used for minimization during gradient boosting. - Supported values: "logLoss", "leastSquaresError", - "leastAbsoluteError". - (default: "logLoss") - :param numIterations: - Number of iterations of boosting. - (default: 100) - :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 3) - :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories. - (default: 32) - :return: - GradientBoostedTreesModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.3.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1}. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + loss : str, optional + Loss function used for minimization during gradient boosting. + Supported values: "logLoss", "leastSquaresError", + "leastAbsoluteError". + (default: "logLoss") + numIterations : int, optional + Number of iterations of boosting. + (default: 100) + learningRate : float, optional + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 3) + maxBins : int, optional + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories. + (default: 32) + + Returns + ------- + :py:class:`GradientBoostedTreesModel` + that can be used for prediction. + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import GradientBoostedTrees >>> @@ -574,44 +611,50 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins) @classmethod - @since("1.3.0") def trainRegressor(cls, data, categoricalFeaturesInfo, loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): """ Train a gradient-boosted trees model for regression. - :param data: - Training dataset: RDD of LabeledPoint. Labels are real numbers. - :param categoricalFeaturesInfo: - Map storing arity of categorical features. 
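The gradient-boosted trees API converted above follows the same shape; a minimal sketch under the same assumptions (live `sc`, illustrative data):

```
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees

data = sc.parallelize([
    LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
    LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0]),
])
model = GradientBoostedTrees.trainClassifier(data, categoricalFeaturesInfo={},
                                             numIterations=10)
print(model.numTrees(), model.totalNumNodes())
print(model.predict([2.5]))
```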
An entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. - :param loss: - Loss function used for minimization during gradient boosting. - Supported values: "logLoss", "leastSquaresError", - "leastAbsoluteError". - (default: "leastSquaresError") - :param numIterations: - Number of iterations of boosting. - (default: 100) - :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: - Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 - means 1 internal node + 2 leaf nodes). - (default: 3) - :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories. - (default: 32) - :return: - GradientBoostedTreesModel that can be used for prediction. - - Example usage: - + .. versionadded:: 1.3.0 + + Parameters + ---------- + data : + Training dataset: RDD of LabeledPoint. Labels are real numbers. + categoricalFeaturesInfo : dict + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + loss : str, optional + Loss function used for minimization during gradient boosting. + Supported values: "logLoss", "leastSquaresError", + "leastAbsoluteError". + (default: "leastSquaresError") + numIterations : int, optional + Number of iterations of boosting. + (default: 100) + learningRate : float, optional + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + maxDepth : int, optional + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). + (default: 3) + maxBins : int, optional + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories. + (default: 32) + + Returns + ------- + :py:class:`GradientBoostedTreesModel` + that can be used for prediction. + + Examples + -------- >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import GradientBoostedTrees >>> from pyspark.mllib.linalg import SparseVector diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a0be29a82e3dc..68feb9563852c 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -65,7 +65,6 @@ def _convert_labeled_point_to_libsvm(p): return " ".join(items) @staticmethod - @since("1.0.0") def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): """ Loads labeled data in the LIBSVM format into an RDD of @@ -79,20 +78,33 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): method parses each line into a LabeledPoint, where the feature indices are converted to zero-based. - :param sc: Spark context - :param path: file or directory path in any Hadoop-supported file - system URI - :param numFeatures: number of features, which will be determined - from the input data if a nonpositive value - is given. This is useful when the dataset is - already split into multiple files and you - want to load them separately, because some - features may not present in certain files, - which leads to inconsistent feature - dimensions. - :param minPartitions: min number of partitions - :return: labeled data stored as an RDD of LabeledPoint - + .. 
versionadded:: 1.0.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context + path : str + file or directory path in any Hadoop-supported file system URI + numFeatures : int, optional + number of features, which will be determined + from the input data if a nonpositive value + is given. This is useful when the dataset is + already split into multiple files and you + want to load them separately, because some + features may not present in certain files, + which leads to inconsistent feature + dimensions. + minPartitions : int, optional + min number of partitions + + Returns + ------- + :py:class:`pyspark.RDD` + labeled data stored as an RDD of LabeledPoint + + Examples + -------- >>> from tempfile import NamedTemporaryFile >>> from pyspark.mllib.util import MLUtils >>> from pyspark.mllib.regression import LabeledPoint @@ -118,14 +130,21 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None): return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2]))) @staticmethod - @since("1.0.0") def saveAsLibSVMFile(data, dir): """ Save labeled data in LIBSVM format. - :param data: an RDD of LabeledPoint to be saved - :param dir: directory to save the data + .. versionadded:: 1.0.0 + + Parameters + ---------- + data : :py:class:`pyspark.RDD` + an RDD of LabeledPoint to be saved + dir : str + directory to save the data + Examples + -------- >>> from tempfile import NamedTemporaryFile >>> from fileinput import input >>> from pyspark.mllib.regression import LabeledPoint @@ -143,17 +162,28 @@ def saveAsLibSVMFile(data, dir): lines.saveAsTextFile(dir) @staticmethod - @since("1.1.0") def loadLabeledPoints(sc, path, minPartitions=None): """ Load labeled points saved using RDD.saveAsTextFile. - :param sc: Spark context - :param path: file or directory path in any Hadoop-supported file - system URI - :param minPartitions: min number of partitions - :return: labeled data stored as an RDD of LabeledPoint + .. versionadded:: 1.0.0 + + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context + path : str + file or directory path in any Hadoop-supported file system URI + minPartitions : int, optional + min number of partitions + Returns + ------- + :py:class:`pyspark.RDD` + labeled data stored as an RDD of LabeledPoint + + Examples + -------- >>> from tempfile import NamedTemporaryFile >>> from pyspark.mllib.util import MLUtils >>> from pyspark.mllib.regression import LabeledPoint @@ -193,7 +223,6 @@ def loadVectors(sc, path): return callMLlibFunc("loadVectors", sc, path) @staticmethod - @since("2.0.0") def convertVectorColumnsToML(dataset, *cols): """ Converts vector columns in an input DataFrame from the @@ -201,16 +230,26 @@ def convertVectorColumnsToML(dataset, *cols): :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of vector columns to be converted. - New vector columns will be ignored. If unspecified, all old - vector columns will be converted excepted nested ones. - :return: - the input dataset with old vector columns converted to the - new vector type + .. versionadded:: 2.0.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Vector columns to be converted. + New vector columns will be ignored. If unspecified, all old + vector columns will be converted excepted nested ones. 
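A small round-trip sketch for the `MLUtils` LIBSVM helpers documented above, assuming a live `sc`; the temporary directory is chosen here only for the sketch:

```
from tempfile import mkdtemp
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

# Write two labeled points in LIBSVM text format and read them back.
path = mkdtemp() + "/libsvm-demo"
points = sc.parallelize([LabeledPoint(1.0, [1.23, 4.56]), LabeledPoint(0.0, [0.0, 7.0])])
MLUtils.saveAsLibSVMFile(points, path)
loaded = MLUtils.loadLibSVMFile(sc, path)
print(loaded.first().label)
```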
+ + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with old vector columns converted to the + new vector type + + Examples + -------- >>> import pyspark >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.mllib.util import MLUtils @@ -233,7 +272,6 @@ def convertVectorColumnsToML(dataset, *cols): return callMLlibFunc("convertVectorColumnsToML", dataset, list(cols)) @staticmethod - @since("2.0.0") def convertVectorColumnsFromML(dataset, *cols): """ Converts vector columns in an input DataFrame to the @@ -241,16 +279,26 @@ def convertVectorColumnsFromML(dataset, *cols): :py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of vector columns to be converted. - Old vector columns will be ignored. If unspecified, all new - vector columns will be converted except nested ones. - :return: - the input dataset with new vector columns converted to the - old vector type + .. versionadded:: 2.0.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Vector columns to be converted. + + Old vector columns will be ignored. If unspecified, all new + vector columns will be converted except nested ones. + + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with new vector columns converted to the + old vector type + Examples + -------- >>> import pyspark >>> from pyspark.ml.linalg import Vectors >>> from pyspark.mllib.util import MLUtils @@ -273,7 +321,6 @@ def convertVectorColumnsFromML(dataset, *cols): return callMLlibFunc("convertVectorColumnsFromML", dataset, list(cols)) @staticmethod - @since("2.0.0") def convertMatrixColumnsToML(dataset, *cols): """ Converts matrix columns in an input DataFrame from the @@ -281,16 +328,26 @@ def convertMatrixColumnsToML(dataset, *cols): :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of matrix columns to be converted. - New matrix columns will be ignored. If unspecified, all old - matrix columns will be converted excepted nested ones. - :return: - the input dataset with old matrix columns converted to the - new matrix type + .. versionadded:: 2.0.0 + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Matrix columns to be converted. + + New matrix columns will be ignored. If unspecified, all old + matrix columns will be converted excepted nested ones. + + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with old matrix columns converted to the + new matrix type + + Examples + -------- >>> import pyspark >>> from pyspark.mllib.linalg import Matrices >>> from pyspark.mllib.util import MLUtils @@ -313,7 +370,6 @@ def convertMatrixColumnsToML(dataset, *cols): return callMLlibFunc("convertMatrixColumnsToML", dataset, list(cols)) @staticmethod - @since("2.0.0") def convertMatrixColumnsFromML(dataset, *cols): """ Converts matrix columns in an input DataFrame to the @@ -321,16 +377,26 @@ def convertMatrixColumnsFromML(dataset, *cols): :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` package. - :param dataset: - input dataset - :param cols: - a list of matrix columns to be converted. - Old matrix columns will be ignored. If unspecified, all new - matrix columns will be converted except nested ones. - :return: - the input dataset with new matrix columns converted to the - old matrix type + .. 
versionadded:: 2.0.0 + + Parameters + ---------- + dataset : :py:class:`pyspark.sql.DataFrame` + input dataset + \\*cols : str + Matrix columns to be converted. + + Old matrix columns will be ignored. If unspecified, all new + matrix columns will be converted except nested ones. + Returns + ------- + :py:class:`pyspark.sql.DataFrame` + the input dataset with new matrix columns converted to the + old matrix type + + Examples + -------- >>> import pyspark >>> from pyspark.ml.linalg import Matrices >>> from pyspark.mllib.util import MLUtils @@ -370,10 +436,14 @@ def save(self, sc, path): The model may be loaded using :py:meth:`Loader.load`. - :param sc: Spark context used to save model data. - :param path: Path specifying the directory in which to save - this model. If the directory already exists, - this method throws an exception. + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context used to save model data. + path : str + Path specifying the directory in which to save + this model. If the directory already exists, + this method throws an exception. """ raise NotImplementedError @@ -410,10 +480,17 @@ def load(cls, sc, path): Load a model from the given path. The model should have been saved using :py:meth:`Saveable.save`. - :param sc: Spark context used for loading model files. - :param path: Path specifying the directory to which the model - was saved. - :return: model instance + Parameters + ---------- + sc : :py:class:`pyspark.SparkContext` + Spark context used for loading model files. + path : str + Path specifying the directory to which the model was saved. + + Returns + ------- + object + model instance """ raise NotImplementedError @@ -463,20 +540,33 @@ class LinearDataGenerator(object): """ @staticmethod - @since("1.5.0") def generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps): """ - :param: intercept bias factor, the term c in X'w + c - :param: weights feature vector, the term w in X'w + c - :param: xMean Point around which the data X is centered. - :param: xVariance Variance of the given data - :param: nPoints Number of points to be generated - :param: seed Random Seed - :param: eps Used to scale the noise. If eps is set high, - the amount of gaussian noise added is more. - - Returns a list of LabeledPoints of length nPoints + .. versionadded:: 1.5.0 + + Parameters + ---------- + intercept : float + bias factor, the term c in X'w + c + weights : :py:class:`pyspark.mllib.linalg.Vector` or convertible + feature vector, the term w in X'w + c + xMean : :py:class:`pyspark.mllib.linalg.Vector` or convertible + Point around which the data X is centered. + xVariance : :py:class:`pyspark.mllib.linalg.Vector` or convertible + Variance of the given data + nPoints : int + Number of points to be generated + seed : int + Random Seed + eps : float + Used to scale the noise. If eps is set high, + the amount of gaussian noise added is more. + + Returns + ------- + list + of :py:class:`pyspark.mllib.regression.LabeledPoints` of length nPoints """ weights = [float(weight) for weight in weights] xMean = [float(mean) for mean in xMean] From d1b4f06179f3f7838ae1ce7a6244b2ba75134e41 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 25 Nov 2020 02:02:32 +0000 Subject: [PATCH 004/150] [SPARK-33494][SQL][AQE] Do not use local shuffle reader for repartition ### What changes were proposed in this pull request? This PR updates `ShuffleExchangeExec` to carry more information about how much we can change the partitioning. 
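As a rough sketch of the affected case (assuming a `spark-shell` style session named `spark` and an existing `testData` table, mirroring the test added to `AdaptiveQueryExecSuite` below):

```scala
import org.apache.spark.sql.internal.SQLConf
import spark.implicits._

// Column-only repartition: the user pins *how* rows are partitioned (hash by `key`)
// but not the partition count. AQE may still coalesce partitions, but it must not
// replace this shuffle with a local shuffle reader, which would break the partitioning.
spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true")
val df = spark.table("testData").repartition($"key")
df.collect()
```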
For `repartition(col)`, we should preserve the user-specified partitioning and don't apply the AQE local shuffle reader. ### Why are the changes needed? Similar to `repartition(number, col)`, we should respect the user-specified partitioning. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? a new test Closes #30432 from cloud-fan/aqe. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../spark/sql/execution/SparkStrategies.scala | 14 ++++---- .../adaptive/CoalesceShufflePartitions.scala | 9 +++++- .../adaptive/OptimizeLocalShuffleReader.scala | 11 +++++-- .../exchange/ShuffleExchangeExec.scala | 28 +++++++++++----- .../sql-tests/results/explain-aqe.sql.out | 24 +++++++------- .../sql-tests/results/explain.sql.out | 32 +++++++++---------- .../sql/SparkSessionExtensionSuite.scala | 6 ++-- .../adaptive/AdaptiveQueryExecSuite.scala | 10 ++++++ 8 files changed, 86 insertions(+), 48 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index e9b1aa81895f5..f5f77b03c2b1b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.streaming.{InternalOutputModes, StreamingRe import org.apache.spark.sql.execution.aggregate.AggUtils import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.execution.exchange.{REPARTITION, REPARTITION_WITH_NUM, ShuffleExchangeExec} import org.apache.spark.sql.execution.python._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.MemoryPlan @@ -670,7 +670,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case logical.Repartition(numPartitions, shuffle, child) => if (shuffle) { ShuffleExchangeExec(RoundRobinPartitioning(numPartitions), - planLater(child), noUserSpecifiedNumPartition = false) :: Nil + planLater(child), REPARTITION_WITH_NUM) :: Nil } else { execution.CoalesceExec(numPartitions, planLater(child)) :: Nil } @@ -703,10 +703,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case r: logical.Range => execution.RangeExec(r) :: Nil case r: logical.RepartitionByExpression => - exchange.ShuffleExchangeExec( - r.partitioning, - planLater(r.child), - noUserSpecifiedNumPartition = r.optNumPartitions.isEmpty) :: Nil + val shuffleOrigin = if (r.optNumPartitions.isEmpty) { + REPARTITION + } else { + REPARTITION_WITH_NUM + } + exchange.ShuffleExchangeExec(r.partitioning, planLater(r.child), shuffleOrigin) :: Nil case ExternalRDD(outputObjAttr, rdd) => ExternalRDDScanExec(outputObjAttr, rdd) :: Nil case r: LogicalRDD => RDDScanExec(r.output, r.rdd, "ExistingRDD", r.outputPartitioning, r.outputOrdering) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala index 89ff528d7a188..0cf3ab0cca49a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala @@ -18,8 +18,10 @@ 
package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.plans.physical.SinglePartition import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, REPARTITION, ShuffleExchangeLike} import org.apache.spark.sql.internal.SQLConf /** @@ -47,7 +49,7 @@ case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPl val shuffleStages = collectShuffleStages(plan) // ShuffleExchanges introduced by repartition do not support changing the number of partitions. // We change the number of partitions in the stage only if all the ShuffleExchanges support it. - if (!shuffleStages.forall(_.shuffle.canChangeNumPartitions)) { + if (!shuffleStages.forall(s => supportCoalesce(s.shuffle))) { plan } else { // `ShuffleQueryStageExec#mapStats` returns None when the input RDD has 0 partitions, @@ -82,4 +84,9 @@ case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPl } } } + + private def supportCoalesce(s: ShuffleExchangeLike): Boolean = { + s.outputPartitioning != SinglePartition && + (s.shuffleOrigin == ENSURE_REQUIREMENTS || s.shuffleOrigin == REPARTITION) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala index 8db2827beaf43..8f57947cb6396 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala @@ -18,9 +18,10 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} +import org.apache.spark.sql.catalyst.plans.physical.SinglePartition import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ShuffleExchangeExec} +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleExchangeLike} import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.internal.SQLConf @@ -136,9 +137,13 @@ object OptimizeLocalShuffleReader extends Rule[SparkPlan] { def canUseLocalShuffleReader(plan: SparkPlan): Boolean = plan match { case s: ShuffleQueryStageExec => - s.shuffle.canChangeNumPartitions && s.mapStats.isDefined + s.mapStats.isDefined && supportLocalReader(s.shuffle) case CustomShuffleReaderExec(s: ShuffleQueryStageExec, partitionSpecs) => - s.shuffle.canChangeNumPartitions && s.mapStats.isDefined && partitionSpecs.nonEmpty + s.mapStats.isDefined && partitionSpecs.nonEmpty && supportLocalReader(s.shuffle) case _ => false } + + private def supportLocalReader(s: ShuffleExchangeLike): Boolean = { + s.outputPartitioning != SinglePartition && s.shuffleOrigin == ENSURE_REQUIREMENTS + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 6af4b098bee2f..affa92de693af 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -57,9 +57,9 @@ trait 
ShuffleExchangeLike extends Exchange { def numPartitions: Int /** - * Returns whether the shuffle partition number can be changed. + * The origin of this shuffle operator. */ - def canChangeNumPartitions: Boolean + def shuffleOrigin: ShuffleOrigin /** * The asynchronous job that materializes the shuffle. @@ -77,18 +77,30 @@ trait ShuffleExchangeLike extends Exchange { def runtimeStatistics: Statistics } +// Describes where the shuffle operator comes from. +sealed trait ShuffleOrigin + +// Indicates that the shuffle operator was added by the internal `EnsureRequirements` rule. It +// means that the shuffle operator is used to ensure internal data partitioning requirements and +// Spark is free to optimize it as long as the requirements are still ensured. +case object ENSURE_REQUIREMENTS extends ShuffleOrigin + +// Indicates that the shuffle operator was added by the user-specified repartition operator. Spark +// can still optimize it via changing shuffle partition number, as data partitioning won't change. +case object REPARTITION extends ShuffleOrigin + +// Indicates that the shuffle operator was added by the user-specified repartition operator with +// a certain partition number. Spark can't optimize it. +case object REPARTITION_WITH_NUM extends ShuffleOrigin + /** * Performs a shuffle that will result in the desired partitioning. */ case class ShuffleExchangeExec( override val outputPartitioning: Partitioning, child: SparkPlan, - noUserSpecifiedNumPartition: Boolean = true) extends ShuffleExchangeLike { - - // If users specify the num partitions via APIs like `repartition`, we shouldn't change it. - // For `SinglePartition`, it requires exactly one partition and we can't change it either. - override def canChangeNumPartitions: Boolean = - noUserSpecifiedNumPartition && outputPartitioning != SinglePartition + shuffleOrigin: ShuffleOrigin = ENSURE_REQUIREMENTS) + extends ShuffleExchangeLike { private lazy val writeMetrics = SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext) diff --git a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out index 567e0eabe1805..578b0a807fc52 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 23 +-- Number of queries: 24 -- !query @@ -67,10 +67,10 @@ Aggregate [sum(distinct cast(val#x as bigint)) AS sum(DISTINCT val)#xL] == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[], functions=[sum(distinct cast(val#x as bigint)#xL)], output=[sum(DISTINCT val)#xL]) - +- Exchange SinglePartition, true, [id=#x] + +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [id=#x] +- HashAggregate(keys=[], functions=[partial_sum(distinct cast(val#x as bigint)#xL)], output=[sum#xL]) +- HashAggregate(keys=[cast(val#x as bigint)#xL], functions=[], output=[cast(val#x as bigint)#xL]) - +- Exchange hashpartitioning(cast(val#x as bigint)#xL, 4), true, [id=#x] + +- Exchange hashpartitioning(cast(val#x as bigint)#xL, 4), ENSURE_REQUIREMENTS, [id=#x] +- HashAggregate(keys=[cast(val#x as bigint) AS cast(val#x as bigint)#xL], functions=[], output=[cast(val#x as bigint)#xL]) +- FileScan parquet default.explain_temp1[val#x] Batched: true, DataFilters: [], Format: Parquet, Location [not included in comparison]/{warehouse_dir}/explain_temp1], PartitionFilters: [], PushedFilters: [], 
ReadSchema: struct @@ -116,7 +116,7 @@ Results [2]: [key#x, max#x] (4) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (5) HashAggregate Input [2]: [key#x, max#x] @@ -127,7 +127,7 @@ Results [2]: [key#x, max(val#x)#x AS max(val)#x] (6) Exchange Input [2]: [key#x, max(val)#x] -Arguments: rangepartitioning(key#x ASC NULLS FIRST, 4), true, [id=#x] +Arguments: rangepartitioning(key#x ASC NULLS FIRST, 4), ENSURE_REQUIREMENTS, [id=#x] (7) Sort Input [2]: [key#x, max(val)#x] @@ -179,7 +179,7 @@ Results [2]: [key#x, max#x] (4) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (5) HashAggregate Input [2]: [key#x, max#x] @@ -254,7 +254,7 @@ Results [2]: [key#x, val#x] (7) Exchange Input [2]: [key#x, val#x] -Arguments: hashpartitioning(key#x, val#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, val#x, 4), ENSURE_REQUIREMENTS, [id=#x] (8) HashAggregate Input [2]: [key#x, val#x] @@ -576,7 +576,7 @@ Results [2]: [key#x, max#x] (4) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (5) HashAggregate Input [2]: [key#x, max#x] @@ -605,7 +605,7 @@ Results [2]: [key#x, max#x] (9) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (10) HashAggregate Input [2]: [key#x, max#x] @@ -687,7 +687,7 @@ Results [3]: [count#xL, sum#xL, count#xL] (3) Exchange Input [3]: [count#xL, sum#xL, count#xL] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (4) HashAggregate Input [3]: [count#xL, sum#xL, count#xL] @@ -732,7 +732,7 @@ Results [2]: [key#x, buf#x] (3) Exchange Input [2]: [key#x, buf#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (4) ObjectHashAggregate Input [2]: [key#x, buf#x] @@ -783,7 +783,7 @@ Results [2]: [key#x, min#x] (4) Exchange Input [2]: [key#x, min#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (5) Sort Input [2]: [key#x, min#x] diff --git a/sql/core/src/test/resources/sql-tests/results/explain.sql.out b/sql/core/src/test/resources/sql-tests/results/explain.sql.out index fcd69549f2c6e..886b98e538d28 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 23 +-- Number of queries: 24 -- !query @@ -66,10 +66,10 @@ Aggregate [sum(distinct cast(val#x as bigint)) AS sum(DISTINCT val)#xL] == Physical Plan == *HashAggregate(keys=[], functions=[sum(distinct cast(val#x as bigint)#xL)], output=[sum(DISTINCT val)#xL]) -+- Exchange SinglePartition, true, [id=#x] ++- Exchange SinglePartition, ENSURE_REQUIREMENTS, [id=#x] +- *HashAggregate(keys=[], functions=[partial_sum(distinct cast(val#x as bigint)#xL)], output=[sum#xL]) +- *HashAggregate(keys=[cast(val#x as bigint)#xL], functions=[], output=[cast(val#x as bigint)#xL]) - +- Exchange hashpartitioning(cast(val#x as bigint)#xL, 4), true, [id=#x] + +- Exchange hashpartitioning(cast(val#x as bigint)#xL, 4), ENSURE_REQUIREMENTS, [id=#x] +- 
*HashAggregate(keys=[cast(val#x as bigint) AS cast(val#x as bigint)#xL], functions=[], output=[cast(val#x as bigint)#xL]) +- *ColumnarToRow +- FileScan parquet default.explain_temp1[val#x] Batched: true, DataFilters: [], Format: Parquet, Location [not included in comparison]/{warehouse_dir}/explain_temp1], PartitionFilters: [], PushedFilters: [], ReadSchema: struct @@ -119,7 +119,7 @@ Results [2]: [key#x, max#x] (5) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (6) HashAggregate [codegen id : 2] Input [2]: [key#x, max#x] @@ -130,7 +130,7 @@ Results [2]: [key#x, max(val#x)#x AS max(val)#x] (7) Exchange Input [2]: [key#x, max(val)#x] -Arguments: rangepartitioning(key#x ASC NULLS FIRST, 4), true, [id=#x] +Arguments: rangepartitioning(key#x ASC NULLS FIRST, 4), ENSURE_REQUIREMENTS, [id=#x] (8) Sort [codegen id : 3] Input [2]: [key#x, max(val)#x] @@ -181,7 +181,7 @@ Results [2]: [key#x, max#x] (5) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (6) HashAggregate [codegen id : 2] Input [2]: [key#x, max#x] @@ -259,7 +259,7 @@ Results [2]: [key#x, val#x] (9) Exchange Input [2]: [key#x, val#x] -Arguments: hashpartitioning(key#x, val#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, val#x, 4), ENSURE_REQUIREMENTS, [id=#x] (10) HashAggregate [codegen id : 4] Input [2]: [key#x, val#x] @@ -452,7 +452,7 @@ Results [1]: [max#x] (9) Exchange Input [1]: [max#x] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (10) HashAggregate [codegen id : 2] Input [1]: [max#x] @@ -498,7 +498,7 @@ Results [1]: [max#x] (16) Exchange Input [1]: [max#x] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (17) HashAggregate [codegen id : 2] Input [1]: [max#x] @@ -580,7 +580,7 @@ Results [1]: [max#x] (9) Exchange Input [1]: [max#x] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (10) HashAggregate [codegen id : 2] Input [1]: [max#x] @@ -626,7 +626,7 @@ Results [2]: [sum#x, count#xL] (16) Exchange Input [2]: [sum#x, count#xL] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (17) HashAggregate [codegen id : 2] Input [2]: [sum#x, count#xL] @@ -690,7 +690,7 @@ Results [2]: [sum#x, count#xL] (7) Exchange Input [2]: [sum#x, count#xL] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (8) HashAggregate [codegen id : 2] Input [2]: [sum#x, count#xL] @@ -810,7 +810,7 @@ Results [2]: [key#x, max#x] (5) Exchange Input [2]: [key#x, max#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (6) HashAggregate [codegen id : 4] Input [2]: [key#x, max#x] @@ -901,7 +901,7 @@ Results [3]: [count#xL, sum#xL, count#xL] (4) Exchange Input [3]: [count#xL, sum#xL, count#xL] -Arguments: SinglePartition, true, [id=#x] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [id=#x] (5) HashAggregate [codegen id : 2] Input [3]: [count#xL, sum#xL, count#xL] @@ -945,7 +945,7 @@ Results [2]: [key#x, buf#x] (4) Exchange Input [2]: [key#x, buf#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (5) ObjectHashAggregate Input [2]: [key#x, buf#x] @@ 
-995,7 +995,7 @@ Results [2]: [key#x, min#x] (5) Exchange Input [2]: [key#x, min#x] -Arguments: hashpartitioning(key#x, 4), true, [id=#x] +Arguments: hashpartitioning(key#x, 4), ENSURE_REQUIREMENTS, [id=#x] (6) Sort [codegen id : 2] Input [2]: [key#x, min#x] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index 951b72a863483..12abd31b99e93 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, QueryStageExec} -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, BroadcastExchangeLike, ShuffleExchangeExec, ShuffleExchangeLike} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, BroadcastExchangeLike, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin} import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.COLUMN_BATCH_SIZE @@ -766,7 +766,9 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] { case class MyShuffleExchangeExec(delegate: ShuffleExchangeExec) extends ShuffleExchangeLike { override def numMappers: Int = delegate.numMappers override def numPartitions: Int = delegate.numPartitions - override def canChangeNumPartitions: Boolean = delegate.canChangeNumPartitions + override def shuffleOrigin: ShuffleOrigin = { + delegate.shuffleOrigin + } override def mapOutputStatisticsFuture: Future[MapOutputStatistics] = delegate.mapOutputStatisticsFuture override def getShuffleRDD(partitionSpecs: Array[ShufflePartitionSpec]): RDD[_] = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 38a323b1c057e..758965954b374 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -1307,4 +1307,14 @@ class AdaptiveQueryExecSuite spark.listenerManager.unregister(listener) } } + + test("SPARK-33494: Do not use local shuffle reader for repartition") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + val df = spark.table("testData").repartition('key) + df.collect() + // local shuffle reader breaks partitioning and shouldn't be used for repartition operation + // which is specified by users. + checkNumLocalShuffleReaders(df.queryExecution.executedPlan, numShufflesWithoutLocalReader = 1) + } + } } From b7f034d8dc17b9ae5eced387d20f37b9e3e58901 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Wed, 25 Nov 2020 03:04:04 +0000 Subject: [PATCH 005/150] [SPARK-33543][SQL] Migrate SHOW COLUMNS command to use UnresolvedTableOrView to resolve the identifier ### What changes were proposed in this pull request? This PR proposes to migrate `SHOW COLUMNS` to use `UnresolvedTableOrView` to resolve the table/view identifier. This allows consistent resolution rules (temp view first, etc.) to be applied for both v1/v2 commands. 
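A minimal illustration of the resolution order (assuming a `spark-shell` style session named `spark`; the table and view names are only examples):

```scala
// A catalog table and a temporary view share the name `tbl`.
spark.sql("CREATE TABLE tbl(col1 INT, col2 STRING) USING parquet")
spark.range(1).createOrReplaceTempView("tbl")

// With UnresolvedTableOrView, SHOW COLUMNS follows the same rules as other commands:
// the temporary view is resolved first, so this lists the view's single `id` column.
spark.sql("SHOW COLUMNS IN tbl").show()
```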
More info about the consistent resolution rule proposal can be found in [JIRA](https://issues.apache.org/jira/browse/SPARK-29900) or [proposal doc](https://docs.google.com/document/d/1hvLjGA8y_W_hhilpngXVub1Ebv8RsMap986nENCFnrg/edit?usp=sharing). Note that `SHOW COLUMNS` is not yet supported for v2 tables. ### Why are the changes needed? To use `UnresolvedTableOrView` for table/view resolution. Note that `ShowColumnsCommand` internally resolves to a temp view first, so there is no resolution behavior change with this PR. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Updated existing tests. Closes #30490 from imback82/show_columns. Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/AstBuilder.scala | 13 ++++++++++--- .../catalyst/plans/logical/statements.scala | 7 ------- .../catalyst/plans/logical/v2Commands.scala | 10 ++++++++++ .../sql/catalyst/parser/DDLParserSuite.scala | 8 ++++---- .../analysis/ResolveSessionCatalog.scala | 18 +++--------------- .../datasources/v2/DataSourceV2Strategy.scala | 3 +++ .../sql-tests/results/show_columns.sql.out | 16 ++++++++-------- .../sql/connector/DataSourceV2SQLSuite.scala | 11 +++-------- .../spark/sql/execution/command/DDLSuite.scala | 11 +++++++++++ 9 files changed, 52 insertions(+), 45 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index a4298abd211b3..5f8394c525949 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3400,7 +3400,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg /** * A command for users to list the column names for a table. - * This function creates a [[ShowColumnsStatement]] logical plan. + * This function creates a [[ShowColumns]] logical plan. * * The syntax of using this command in SQL is: * {{{ @@ -3409,9 +3409,16 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * }}} */ override def visitShowColumns(ctx: ShowColumnsContext): LogicalPlan = withOrigin(ctx) { - val table = visitMultipartIdentifier(ctx.table) + val nameParts = visitMultipartIdentifier(ctx.table) val namespace = Option(ctx.ns).map(visitMultipartIdentifier) - ShowColumnsStatement(table, namespace) + // Use namespace only if table name doesn't specify it. If namespace is already specified + // in the table name, it's checked against the given namespace after table/view is resolved. 
+ val tableName = if (namespace.isDefined && nameParts.length == 1) { + namespace.get ++ nameParts + } else { + nameParts + } + ShowColumns(UnresolvedTableOrView(tableName), namespace) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 39bc5a5604b20..3660e8a95a7f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -359,13 +359,6 @@ case class ShowPartitionsStatement( tableName: Seq[String], partitionSpec: Option[TablePartitionSpec]) extends ParsedStatement -/** - * A SHOW COLUMNS statement, as parsed from SQL - */ -case class ShowColumnsStatement( - table: Seq[String], - namespace: Option[Seq[String]]) extends ParsedStatement - /** * A SHOW CURRENT NAMESPACE statement, as parsed from SQL */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index a65b9fc59bd55..ebf41f6a6e304 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -671,6 +671,15 @@ case class ShowCreateTable(child: LogicalPlan, asSerde: Boolean = false) extends override def children: Seq[LogicalPlan] = child :: Nil } +/** + * The logical plan of the SHOW COLUMN command. + */ +case class ShowColumns( + child: LogicalPlan, + namespace: Option[Seq[String]]) extends Command { + override def children: Seq[LogicalPlan] = child :: Nil +} + /** * The logical plan of the TRUNCATE TABLE command. 
*/ @@ -679,3 +688,4 @@ case class TruncateTable( partitionSpec: Option[TablePartitionSpec]) extends Command { override def children: Seq[LogicalPlan] = child :: Nil } + diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index 997c642276bfb..cc3c824befb3e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -1643,13 +1643,13 @@ class DDLParserSuite extends AnalysisTest { val sql4 = "SHOW COLUMNS FROM db1.t1 IN db1" val parsed1 = parsePlan(sql1) - val expected1 = ShowColumnsStatement(Seq("t1"), None) + val expected1 = ShowColumns(UnresolvedTableOrView(Seq("t1")), None) val parsed2 = parsePlan(sql2) - val expected2 = ShowColumnsStatement(Seq("db1", "t1"), None) + val expected2 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1")), None) val parsed3 = parsePlan(sql3) - val expected3 = ShowColumnsStatement(Seq("t1"), Some(Seq("db1"))) + val expected3 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1")), Some(Seq("db1"))) val parsed4 = parsePlan(sql4) - val expected4 = ShowColumnsStatement(Seq("db1", "t1"), Some(Seq("db1"))) + val expected4 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1")), Some(Seq("db1"))) comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 726099991a897..395f5efd5a52d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -467,25 +467,13 @@ class ResolveSessionCatalog( v1TableName.asTableIdentifier, partitionSpec) - case ShowColumnsStatement(tbl, ns) => - if (ns.isDefined && ns.get.length > 1) { - throw new AnalysisException( - s"Namespace name should have only one part if specified: ${ns.get.quoted}") - } - // Use namespace only if table name doesn't specify it. If namespace is already specified - // in the table name, it's checked against the given namespace below. 
- val nameParts = if (ns.isDefined && tbl.length == 1) { - ns.get ++ tbl - } else { - tbl - } - val sql = "SHOW COLUMNS" - val v1TableName = parseTempViewOrV1Table(nameParts, sql).asTableIdentifier + case ShowColumns(ResolvedV1TableOrViewIdentifier(ident), ns) => + val v1TableName = ident.asTableIdentifier val resolver = conf.resolver val db = ns match { case Some(db) if v1TableName.database.exists(!resolver(_, db.head)) => throw new AnalysisException( - s"SHOW COLUMNS with conflicting databases: " + + "SHOW COLUMNS with conflicting databases: " + s"'${db.head}' != '${v1TableName.database.get}'") case _ => ns.map(_.head) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 30d976524bfa8..eb0d7010041b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -305,6 +305,9 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case TruncateTable(_: ResolvedTable, _) => throw new AnalysisException("TRUNCATE TABLE is not supported for v2 tables.") + case ShowColumns(_: ResolvedTable, _) => + throw new AnalysisException("SHOW COLUMNS is not supported for v2 tables.") + case _ => Nil } } diff --git a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out index 4f5db7f6c6b2f..6ddffb89987d8 100644 --- a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out @@ -93,8 +93,8 @@ SHOW COLUMNS IN badtable FROM showdb -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.analysis.NoSuchTableException -Table or view 'badtable' not found in database 'showdb'; +org.apache.spark.sql.AnalysisException +Table or view not found: showdb.badtable; line 1 pos 0 -- !query @@ -129,8 +129,8 @@ SHOW COLUMNS IN showdb.showcolumn3 -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.analysis.NoSuchTableException -Table or view 'showcolumn3' not found in database 'showdb'; +org.apache.spark.sql.AnalysisException +Table or view not found: showdb.showcolumn3; line 1 pos 0 -- !query @@ -138,8 +138,8 @@ SHOW COLUMNS IN showcolumn3 FROM showdb -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.analysis.NoSuchTableException -Table or view 'showcolumn3' not found in database 'showdb'; +org.apache.spark.sql.AnalysisException +Table or view not found: showdb.showcolumn3; line 1 pos 0 -- !query @@ -147,8 +147,8 @@ SHOW COLUMNS IN showcolumn4 -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.analysis.NoSuchTableException -Table or view 'showcolumn4' not found in database 'showdb'; +org.apache.spark.sql.AnalysisException +Table or view not found: showcolumn4; line 1 pos 0 -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 9a3fa0c5bd3f4..222fa8ace4dca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2047,14 +2047,9 @@ class DataSourceV2SQLSuite withTable(t) { 
spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - testV1CommandSupportingTempView("SHOW COLUMNS", s"FROM $t") - testV1CommandSupportingTempView("SHOW COLUMNS", s"IN $t") - - val e3 = intercept[AnalysisException] { - sql(s"SHOW COLUMNS FROM tbl IN testcat.ns1.ns2") - } - assert(e3.message.contains("Namespace name should have " + - "only one part if specified: testcat.ns1.ns2")) + testNotSupportedV2Command("SHOW COLUMNS", s"FROM $t") + testNotSupportedV2Command("SHOW COLUMNS", s"IN $t") + testNotSupportedV2Command("SHOW COLUMNS", "FROM tbl IN testcat.ns1.ns2") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 07201f9f85b5d..4f79e71419a10 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -2266,6 +2266,17 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("show columns - invalid db name") { + withTable("tbl") { + sql("CREATE TABLE tbl(col1 int, col2 string) USING parquet ") + val message = intercept[AnalysisException] { + sql("SHOW COLUMNS IN tbl FROM a.b.c") + }.getMessage + assert(message.contains( + "The namespace in session catalog must have exactly one name part: a.b.c.tbl")) + } + } + test("SPARK-18009 calling toLocalIterator on commands") { import scala.collection.JavaConverters._ val df = sql("show databases") From edab094dda3d5acbc100d01bd98e0ab15d7b4178 Mon Sep 17 00:00:00 2001 From: "Jungtaek Lim (HeartSaVioR)" Date: Wed, 25 Nov 2020 13:12:20 +0900 Subject: [PATCH 006/150] [SPARK-33224][SS][WEBUI] Add watermark gap information into SS UI page ### What changes were proposed in this pull request? This PR proposes to add the watermark gap information in SS UI page. Please refer below screenshots to see what we'd like to show in UI. ![Screen Shot 2020-11-19 at 6 56 38 PM](https://user-images.githubusercontent.com/1317309/99669306-3532d080-2ab2-11eb-9a93-03d2c6a54948.png) Please note that this PR doesn't plot the watermark value - knowing the gap between actual wall clock and watermark looks more useful than the absolute value. ### Why are the changes needed? Watermark is the one of major metrics the end users need to track for stateful queries. Watermark defines "when" the output will be emitted for append mode, hence knowing how much gap between wall clock and watermark (input data) is very helpful to make expectation of the output. ### Does this PR introduce _any_ user-facing change? Yes, SS UI query page will contain the watermark gap information. ### How was this patch tested? Basic UT added. Manually tested with two queries: > simple case You'll see consistent watermark gap with (15 seconds + a) = 10 seconds are from delay in watermark definition, 5 seconds are trigger interval. 
``` import org.apache.spark.sql.streaming.Trigger spark.conf.set("spark.sql.shuffle.partitions", "10") val query = spark .readStream .format("rate") .option("rowsPerSecond", 1000) .option("rampUpTime", "10s") .load() .selectExpr("timestamp", "mod(value, 100) as mod", "value") .withWatermark("timestamp", "10 seconds") .groupBy(window($"timestamp", "1 minute", "10 seconds"), $"mod") .agg(max("value").as("max_value"), min("value").as("min_value"), avg("value").as("avg_value")) .writeStream .format("console") .trigger(Trigger.ProcessingTime("5 seconds")) .outputMode("append") .start() query.awaitTermination() ``` ![Screen Shot 2020-11-19 at 7 00 21 PM](https://user-images.githubusercontent.com/1317309/99669049-dbcaa180-2ab1-11eb-8789-10b35857dda0.png) > complicated case This randomizes the timestamp, hence producing random watermark gap. This won't be smaller than 15 seconds as I described earlier. ``` import org.apache.spark.sql.streaming.Trigger spark.conf.set("spark.sql.shuffle.partitions", "10") val query = spark .readStream .format("rate") .option("rowsPerSecond", 1000) .option("rampUpTime", "10s") .load() .selectExpr("*", "CAST(CAST(timestamp AS BIGINT) - CAST((RAND() * 100000) AS BIGINT) AS TIMESTAMP) AS tsMod") .selectExpr("tsMod", "mod(value, 100) as mod", "value") .withWatermark("tsMod", "10 seconds") .groupBy(window($"tsMod", "1 minute", "10 seconds"), $"mod") .agg(max("value").as("max_value"), min("value").as("min_value"), avg("value").as("avg_value")) .writeStream .format("console") .trigger(Trigger.ProcessingTime("5 seconds")) .outputMode("append") .start() query.awaitTermination() ``` ![Screen Shot 2020-11-19 at 6 56 47 PM](https://user-images.githubusercontent.com/1317309/99669029-d5d4c080-2ab1-11eb-9c63-d05b3e1ab391.png) Closes #30427 from HeartSaVioR/SPARK-33224. Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../ui/StreamingQueryStatisticsPage.scala | 53 +++++++++++++++++++ .../sql/streaming/ui/UISeleniumSuite.scala | 15 ++++-- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala index 77b1e61d587a7..24709ba470cde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala @@ -140,6 +140,58 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab)
} + def generateWatermark( + query: StreamingQueryUIData, + minBatchTime: Long, + maxBatchTime: Long, + jsCollector: JsCollector): Seq[Node] = { + // This is made sure on caller side but put it here to be defensive + require(query.lastProgress != null) + if (query.lastProgress.eventTime.containsKey("watermark")) { + val watermarkData = query.recentProgress.flatMap { p => + val batchTimestamp = parseProgressTimestamp(p.timestamp) + val watermarkValue = parseProgressTimestamp(p.eventTime.get("watermark")) + if (watermarkValue > 0L) { + // seconds + Some((batchTimestamp, ((batchTimestamp - watermarkValue) / 1000.0))) + } else { + None + } + } + + if (watermarkData.nonEmpty) { + val maxWatermark = watermarkData.maxBy(_._2)._2 + val graphUIDataForWatermark = + new GraphUIData( + "watermark-gap-timeline", + "watermark-gap-histogram", + watermarkData, + minBatchTime, + maxBatchTime, + 0, + maxWatermark, + "seconds") + graphUIDataForWatermark.generateDataJs(jsCollector) + + // scalastyle:off + + +
+
Global Watermark Gap {SparkUIUtils.tooltip("The gap between batch timestamp and global watermark for the batch.", "right")}
+
+ + {graphUIDataForWatermark.generateTimelineHtml(jsCollector)} + {graphUIDataForWatermark.generateHistogramHtml(jsCollector)} + + // scalastyle:on + } else { + Seq.empty[Node] + } + } else { + Seq.empty[Node] + } + } + def generateAggregatedStateOperators( query: StreamingQueryUIData, minBatchTime: Long, @@ -465,6 +517,7 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) {graphUIDataForDuration.generateAreaStackHtmlWithData(jsCollector, operationDurationData)} + {generateWatermark(query, minBatchTime, maxBatchTime, jsCollector)} {generateAggregatedStateOperators(query, minBatchTime, maxBatchTime, jsCollector)} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala index 94844c4e87a84..db3d6529c9906 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UISeleniumSuite.scala @@ -31,8 +31,10 @@ import org.apache.spark.internal.config.UI.{UI_ENABLED, UI_PORT} import org.apache.spark.sql.LocalSparkSession.withSparkSession import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.quietly +import org.apache.spark.sql.functions.{window => windowFn, _} +import org.apache.spark.sql.internal.SQLConf.SHUFFLE_PARTITIONS import org.apache.spark.sql.internal.StaticSQLConf.ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST -import org.apache.spark.sql.streaming.StreamingQueryException +import org.apache.spark.sql.streaming.{StreamingQueryException, Trigger} import org.apache.spark.ui.SparkUICssErrorHandler class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with BeforeAndAfterAll { @@ -52,6 +54,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B val conf = new SparkConf() .setMaster(master) .setAppName("ui-test") + .set(SHUFFLE_PARTITIONS, 5) .set(UI_ENABLED, true) .set(UI_PORT, 0) .set(ENABLED_STREAMING_UI_CUSTOM_METRIC_LIST, Seq("stateOnCurrentVersionSizeBytes")) @@ -79,10 +82,15 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B val input1 = spark.readStream.format("rate").load() val input2 = spark.readStream.format("rate").load() + val input3 = spark.readStream.format("rate").load() val activeQuery = - input1.join(input2, "value").writeStream.format("noop").start() + input1.selectExpr("timestamp", "mod(value, 100) as mod", "value") + .withWatermark("timestamp", "0 second") + .groupBy(windowFn($"timestamp", "10 seconds", "2 seconds"), $"mod") + .agg(avg("value").as("avg_value")) + .writeStream.format("noop").trigger(Trigger.ProcessingTime("5 seconds")).start() val completedQuery = - input1.join(input2, "value").writeStream.format("noop").start() + input2.join(input3, "value").writeStream.format("noop").start() completedQuery.stop() val failedQuery = spark.readStream.format("rate").load().select("value").as[Long] .map(_ / 0).writeStream.format("noop").start() @@ -138,6 +146,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B summaryText should contain ("Input Rows (?)") summaryText should contain ("Batch Duration (?)") summaryText should contain ("Operation Duration (?)") + summaryText should contain ("Global Watermark Gap (?)") summaryText should contain ("Aggregated Number Of Total State Rows (?)") summaryText should contain ("Aggregated Number Of Updated State Rows (?)") summaryText should contain ("Aggregated State 
Memory Used In Bytes (?)") From c3ce9701b458511255072c72b9b245036fa98653 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Tue, 24 Nov 2020 20:18:45 -0800 Subject: [PATCH 007/150] [SPARK-33533][SQL] Fix the regression bug that ConnectionProviders don't consider case-sensitivity for properties ### What changes were proposed in this pull request? This PR fixes an issue that `BasicConnectionProvider` doesn't consider case-sensitivity for properties. For example, the property `oracle.jdbc.mapDateToTimestamp` should be considered case-sensitivity but it is not considered. ### Why are the changes needed? This is a bug introduced by #29024 . Caused by this issue, `OracleIntegrationSuite` doesn't pass. ``` [info] - SPARK-16625: General data types to be mapped to Oracle *** FAILED *** (32 seconds, 129 milliseconds) [info] types.apply(9).equals(org.apache.spark.sql.types.DateType) was false (OracleIntegrationSuite.scala:238) [info] org.scalatest.exceptions.TestFailedException: [info] at org.scalatest.Assertions.newAssertionFailedException(Assertions.scala:472) [info] at org.scalatest.Assertions.newAssertionFailedException$(Assertions.scala:471) [info] at org.scalatest.Assertions$.newAssertionFailedException(Assertions.scala:1231) [info] at org.scalatest.Assertions$AssertionsHelper.macroAssert(Assertions.scala:1295) [info] at org.apache.spark.sql.jdbc.OracleIntegrationSuite.$anonfun$new$4(OracleIntegrationSuite.scala:238) [info] at org.scalatest.OutcomeOf.outcomeOf(OutcomeOf.scala:85) [info] at org.scalatest.OutcomeOf.outcomeOf$(OutcomeOf.scala:83) [info] at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) [info] at org.scalatest.Transformer.apply(Transformer.scala:22) [info] at org.scalatest.Transformer.apply(Transformer.scala:20) [info] at org.scalatest.funsuite.AnyFunSuiteLike$$anon$1.apply(AnyFunSuiteLike.scala:190) [info] at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:176) [info] at org.scalatest.funsuite.AnyFunSuiteLike.invokeWithFixture$1(AnyFunSuiteLike.scala:188) [info] at org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTest$1(AnyFunSuiteLike.scala:200) [info] at org.scalatest.SuperEngine.runTestImpl(Engine.scala:306) [info] at org.scalatest.funsuite.AnyFunSuiteLike.runTest(AnyFunSuiteLike.scala:200) [info] at org.scalatest.funsuite.AnyFunSuiteLike.runTest$(AnyFunSuiteLike.scala:182) [info] at org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterEach$$super$runTest(SparkFunSuite.scala:61) [info] at org.scalatest.BeforeAndAfterEach.runTest(BeforeAndAfterEach.scala:234) [info] at org.scalatest.BeforeAndAfterEach.runTest$(BeforeAndAfterEach.scala:227) [info] at org.apache.spark.SparkFunSuite.runTest(SparkFunSuite.scala:61) [info] at org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$runTests$1(AnyFunSuiteLike.scala:233) [info] at org.scalatest.SuperEngine.$anonfun$runTestsInBranch$1(Engine.scala:413) [info] at scala.collection.immutable.List.foreach(List.scala:392) [info] at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:401) [info] at org.scalatest.SuperEngine.runTestsInBranch(Engine.scala:396) [info] at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:475) [info] at org.scalatest.funsuite.AnyFunSuiteLike.runTests(AnyFunSuiteLike.scala:233) [info] at org.scalatest.funsuite.AnyFunSuiteLike.runTests$(AnyFunSuiteLike.scala:232) [info] at org.scalatest.funsuite.AnyFunSuite.runTests(AnyFunSuite.scala:1563) [info] at org.scalatest.Suite.run(Suite.scala:1112) [info] at org.scalatest.Suite.run$(Suite.scala:1094) [info] at 
org.scalatest.funsuite.AnyFunSuite.org$scalatest$funsuite$AnyFunSuiteLike$$super$run(AnyFunSuite.scala:1563) [info] at org.scalatest.funsuite.AnyFunSuiteLike.$anonfun$run$1(AnyFunSuiteLike.scala:237) [info] at org.scalatest.SuperEngine.runImpl(Engine.scala:535) [info] at org.scalatest.funsuite.AnyFunSuiteLike.run(AnyFunSuiteLike.scala:237) [info] at org.scalatest.funsuite.AnyFunSuiteLike.run$(AnyFunSuiteLike.scala:236) [info] at org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterAll$$super$run(SparkFunSuite.scala:61) [info] at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:213) [info] at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210) [info] at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208) [info] at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:61) [info] at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:318) [info] at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:513) [info] at sbt.ForkMain$Run.lambda$runTest$1(ForkMain.java:413) [info] at java.util.concurrent.FutureTask.run(FutureTask.java:266) [info] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [info] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [info] at java.lang.Thread.run(Thread.java:748) ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? With this change, I confirmed that `OracleIntegrationSuite` passes with the following command. ``` $ git clone https://github.com/oracle/docker-images.git $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles $ ./buildDockerImage.sh -v 18.4.0 -x $ ORACLE_DOCKER_IMAGE_NAME=oracle/database:18.4.0-xe build/sbt -Pdocker-integration-tests -Phive -Phive-thriftserver "testOnly org.apache.spark.sql.jdbc.OracleIntegrationSuite" ``` Closes #30485 from sarutak/fix-oracle-integration-suite. 
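A minimal sketch of the failure mode fixed below (plain Scala, not the provider code itself; the lowercased key stands in for how the case-insensitively stored options reach the provider):

```scala
import java.util.Properties

// Before the fix, the provider copied keys from a case-insensitively stored map,
// so a property such as oracle.jdbc.mapDateToTimestamp reached the driver lowercased.
val lowerCasedOptions = Map("oracle.jdbc.mapdatetotimestamp" -> "false")

val props = new Properties()
lowerCasedOptions.foreach { case (k, v) => props.put(k, v) }

// The Oracle driver looks the property up case-sensitively and never finds it.
assert(props.getProperty("oracle.jdbc.mapDateToTimestamp") == null)
```

Iterating `jdbcOptions.asProperties` instead, as in the change below, keeps the caller's original casing.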
Authored-by: Kousuke Saruta Signed-off-by: Dongjoon Hyun --- .../datasources/jdbc/connection/BasicConnectionProvider.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/connection/BasicConnectionProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/connection/BasicConnectionProvider.scala index 1c0513f982a1e..890205f2f6826 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/connection/BasicConnectionProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/connection/BasicConnectionProvider.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.datasources.jdbc.connection import java.sql.{Connection, Driver} import java.util.Properties +import scala.collection.JavaConverters._ + import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.jdbc.JdbcConnectionProvider @@ -40,7 +42,7 @@ private[jdbc] class BasicConnectionProvider extends JdbcConnectionProvider with override def getConnection(driver: Driver, options: Map[String, String]): Connection = { val jdbcOptions = new JDBCOptions(options) val properties = getAdditionalProperties(jdbcOptions) - options.foreach { case(k, v) => + jdbcOptions.asProperties.asScala.foreach { case(k, v) => properties.put(k, v) } logDebug(s"JDBC connection initiated with URL: ${jdbcOptions.url} and properties: $properties") From 781e19c4d1f376b52e5305078356bf0a58522bcd Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Wed, 25 Nov 2020 16:38:55 +0900 Subject: [PATCH 008/150] [SPARK-33477][SQL] Hive Metastore support filter by date type ### What changes were proposed in this pull request? Hive Metastore supports strings and integral types in filters. It could also support dates. Please see [HIVE-5679](https://github.com/apache/hive/commit/5106bf1c8671740099fca8e1a7d4b37afe97137f) for more details. This pr add support it. ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #30408 from wangyum/SPARK-33477. 
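A usage-level sketch of what this enables (assuming a Hive-enabled `spark-shell` session; table and column names are illustrative):

```scala
// `dt` is a DATE partition column of a Hive table.
spark.sql("CREATE TABLE sales(amount DOUBLE) PARTITIONED BY (dt DATE) STORED AS PARQUET")

// Filters like these can now be converted by the shim's convertFilters and pushed to
// the Hive Metastore, so only the matching partitions' metadata is fetched.
spark.sql("SELECT * FROM sales WHERE dt = DATE'2020-11-01'").show()
spark.sql("SELECT * FROM sales WHERE dt IN (DATE'2020-11-01', DATE'2020-11-02')").show()
```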
Authored-by: Yuming Wang Signed-off-by: HyukjinKwon --- .../spark/sql/hive/HiveExternalCatalog.scala | 6 +- .../spark/sql/hive/client/HiveClient.scala | 3 +- .../sql/hive/client/HiveClientImpl.scala | 6 +- .../spark/sql/hive/client/HiveShim.scala | 46 ++++++++-- .../spark/sql/hive/client/FiltersSuite.scala | 35 ++++++- .../client/HivePartitionFilteringSuite.scala | 92 +++++++++++++++---- .../spark/sql/hive/client/VersionsSuite.scala | 3 +- 7 files changed, 155 insertions(+), 36 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 907bb86ad0c1c..54c237f78cb9c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions} import org.apache.spark.sql.hive.client.HiveClient @@ -1264,11 +1264,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat defaultTimeZoneId: String): Seq[CatalogTablePartition] = withClient { val rawTable = getRawTable(db, table) val catalogTable = restoreTableMetadata(rawTable) + val timeZoneId = CaseInsensitiveMap(catalogTable.storage.properties).getOrElse( + DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId) val partColNameMap = buildLowerCasePartColNameMap(catalogTable) val clientPrunedPartitions = - client.getPartitionsByFilter(rawTable, predicates).map { part => + client.getPartitionsByFilter(rawTable, predicates, timeZoneId).map { part => part.copy(spec = restorePartitionSpec(part.spec, partColNameMap)) } prunePartitionsByFilter(catalogTable, clientPrunedPartitions, predicates, defaultTimeZoneId) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index 3ea80eaf6f714..48f3837740933 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -233,7 +233,8 @@ private[hive] trait HiveClient { /** Returns partitions filtered by predicates for the given table. */ def getPartitionsByFilter( catalogTable: CatalogTable, - predicates: Seq[Expression]): Seq[CatalogTablePartition] + predicates: Seq[Expression], + timeZoneId: String): Seq[CatalogTablePartition] /** Loads a static partition into an existing table. 
*/ def loadPartition( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 9bc99b08c2cc8..b2f0867114bae 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -733,9 +733,11 @@ private[hive] class HiveClientImpl( override def getPartitionsByFilter( table: CatalogTable, - predicates: Seq[Expression]): Seq[CatalogTablePartition] = withHiveState { + predicates: Seq[Expression], + timeZoneId: String): Seq[CatalogTablePartition] = withHiveState { val hiveTable = toHiveTable(table, Some(userName)) - val parts = shim.getPartitionsByFilter(client, hiveTable, predicates).map(fromHivePartition) + val parts = shim.getPartitionsByFilter(client, hiveTable, predicates, timeZoneId) + .map(fromHivePartition) HiveCatalogMetrics.incrementFetchedPartitions(parts.length) parts } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index d989f0154ea95..17a64a67df283 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -45,9 +45,9 @@ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTablePartition, CatalogUtils, FunctionResource, FunctionResourceType} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.util.TypeUtils +import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TypeUtils} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{AtomicType, IntegralType, StringType} +import org.apache.spark.sql.types.{AtomicType, DateType, IntegralType, StringType} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -79,7 +79,11 @@ private[client] sealed abstract class Shim { def getAllPartitions(hive: Hive, table: Table): Seq[Partition] - def getPartitionsByFilter(hive: Hive, table: Table, predicates: Seq[Expression]): Seq[Partition] + def getPartitionsByFilter( + hive: Hive, + table: Table, + predicates: Seq[Expression], + timeZoneId: String): Seq[Partition] def getCommandProcessor(token: String, conf: HiveConf): CommandProcessor @@ -349,7 +353,8 @@ private[client] class Shim_v0_12 extends Shim with Logging { override def getPartitionsByFilter( hive: Hive, table: Table, - predicates: Seq[Expression]): Seq[Partition] = { + predicates: Seq[Expression], + timeZoneId: String): Seq[Partition] = { // getPartitionsByFilter() doesn't support binary comparison ops in Hive 0.12. // See HIVE-4888. logDebug("Hive 0.12 doesn't support predicate pushdown to metastore. " + @@ -632,7 +637,9 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { * * Unsupported predicates are skipped. */ - def convertFilters(table: Table, filters: Seq[Expression]): String = { + def convertFilters(table: Table, filters: Seq[Expression], timeZoneId: String): String = { + lazy val dateFormatter = DateFormatter(DateTimeUtils.getZoneId(timeZoneId)) + /** * An extractor that matches all binary comparison operators except null-safe equality. 
* @@ -650,6 +657,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs. case Literal(value, _: IntegralType) => Some(value.toString) case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString)) + case Literal(value, _: DateType) => + Some(dateFormatter.format(value.asInstanceOf[Int])) case _ => None } } @@ -700,6 +709,21 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } } + object ExtractableDateValues { + private lazy val valueToLiteralString: PartialFunction[Any, String] = { + case value: Int => dateFormatter.format(value) + } + + def unapply(values: Set[Any]): Option[Seq[String]] = { + val extractables = values.toSeq.map(valueToLiteralString.lift) + if (extractables.nonEmpty && extractables.forall(_.isDefined)) { + Some(extractables.map(_.get)) + } else { + None + } + } + } + object SupportedAttribute { // hive varchar is treated as catalyst string, but hive varchar can't be pushed down. private val varcharKeys = table.getPartitionKeys.asScala @@ -711,7 +735,8 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { val resolver = SQLConf.get.resolver if (varcharKeys.exists(c => resolver(c, attr.name))) { None - } else if (attr.dataType.isInstanceOf[IntegralType] || attr.dataType == StringType) { + } else if (attr.dataType.isInstanceOf[IntegralType] || attr.dataType == StringType || + attr.dataType == DateType) { Some(attr.name) } else { None @@ -748,6 +773,10 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { convert(And(GreaterThanOrEqual(child, Literal(sortedValues.head, dataType)), LessThanOrEqual(child, Literal(sortedValues.last, dataType)))) + case InSet(child @ ExtractAttribute(SupportedAttribute(name)), ExtractableDateValues(values)) + if useAdvanced && child.dataType == DateType => + Some(convertInToOr(name, values)) + case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)) if useAdvanced => Some(convertInToOr(name, values)) @@ -803,11 +832,12 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { override def getPartitionsByFilter( hive: Hive, table: Table, - predicates: Seq[Expression]): Seq[Partition] = { + predicates: Seq[Expression], + timeZoneId: String): Seq[Partition] = { // Hive getPartitionsByFilter() takes a string that represents partition // predicates like "str_key=\"value\" and int_key=1 ..." 
- val filter = convertFilters(table, predicates) + val filter = convertFilters(table, predicates, timeZoneId) val partitions = if (filter.isEmpty) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 12b409e487061..6c0531182e6d6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive.client +import java.sql.Date import java.util.Collections import org.apache.hadoop.hive.metastore.api.FieldSchema @@ -29,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * A set of tests for the filter conversion logic used when pushing partition pruning into the @@ -63,6 +65,28 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil, "1 = intcol and \"a\" = strcol") + filterTest("date filter", + (a("datecol", DateType) === Literal(Date.valueOf("2019-01-01"))) :: Nil, + "datecol = 2019-01-01") + + filterTest("date filter with IN predicate", + (a("datecol", DateType) in + (Literal(Date.valueOf("2019-01-01")), Literal(Date.valueOf("2019-01-07")))) :: Nil, + "(datecol = 2019-01-01 or datecol = 2019-01-07)") + + filterTest("date and string filter", + (Literal(Date.valueOf("2019-01-01")) === a("datecol", DateType)) :: + (Literal("a") === a("strcol", IntegerType)) :: Nil, + "2019-01-01 = datecol and \"a\" = strcol") + + filterTest("date filter with null", + (a("datecol", DateType) === Literal(null)) :: Nil, + "") + + filterTest("string filter with InSet predicate", + InSet(a("strcol", StringType), Set("1", "2").map(s => UTF8String.fromString(s))) :: Nil, + "(strcol = \"1\" or strcol = \"2\")") + filterTest("skip varchar", (Literal("") === a("varchar", StringType)) :: Nil, "") @@ -89,7 +113,7 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") { - val converted = shim.convertFilters(testTable, filters) + val converted = shim.convertFilters(testTable, filters, conf.sessionLocalTimeZone) if (converted != result) { fail(s"Expected ${filters.mkString(",")} to convert to '$result' but got '$converted'") } @@ -104,7 +128,7 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { val filters = (Literal(1) === a("intcol", IntegerType) || Literal(2) === a("intcol", IntegerType)) :: Nil - val converted = shim.convertFilters(testTable, filters) + val converted = shim.convertFilters(testTable, filters, conf.sessionLocalTimeZone) if (enabled) { assert(converted == "(1 = intcol or 2 = intcol)") } else { @@ -116,7 +140,7 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { test("SPARK-33416: Avoid Hive metastore stack overflow when InSet predicate have many values") { def checkConverted(inSet: InSet, result: String): Unit = { - assert(shim.convertFilters(testTable, inSet :: Nil) == result) + assert(shim.convertFilters(testTable, inSet :: Nil, conf.sessionLocalTimeZone) == result) } 
withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "15") { @@ -139,6 +163,11 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { InSet(a("doublecol", DoubleType), Range(1, 20).map(s => Literal(s.toDouble).eval(EmptyRow)).toSet), "") + + checkConverted( + InSet(a("datecol", DateType), + Range(1, 20).map(d => Literal(d, DateType).eval(EmptyRow)).toSet), + "(datecol >= 1970-01-02 and datecol <= 1970-01-20)") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index 81186909bb167..ab83f751f1425 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive.client +import java.sql.Date + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat @@ -28,7 +30,8 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StringType, StructType} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{BooleanType, DateType, IntegerType, LongType, StringType, StructType} import org.apache.spark.util.Utils class HivePartitionFilteringSuite(version: String) @@ -38,15 +41,16 @@ class HivePartitionFilteringSuite(version: String) private val testPartitionCount = 3 * 5 * 4 - private def init(tryDirectSql: Boolean): HiveClient = { - val storageFormat = CatalogStorageFormat( - locationUri = None, - inputFormat = None, - outputFormat = None, - serde = None, - compressed = false, - properties = Map.empty) + private val storageFormat = CatalogStorageFormat( + locationUri = None, + inputFormat = Some(classOf[TextInputFormat].getName), + outputFormat = Some(classOf[HiveIgnoreKeyTextOutputFormat[_, _]].getName), + serde = Some(classOf[LazySimpleSerDe].getName()), + compressed = false, + properties = Map.empty + ) + private def init(tryDirectSql: Boolean): HiveClient = { val hadoopConf = new Configuration() hadoopConf.setBoolean(tryDirectSqlKey, tryDirectSql) hadoopConf.set("hive.metastore.warehouse.dir", Utils.createTempDir().toURI().toString()) @@ -58,14 +62,7 @@ class HivePartitionFilteringSuite(version: String) tableType = CatalogTableType.MANAGED, schema = tableSchema, partitionColumnNames = Seq("ds", "h", "chunk"), - storage = CatalogStorageFormat( - locationUri = None, - inputFormat = Some(classOf[TextInputFormat].getName), - outputFormat = Some(classOf[HiveIgnoreKeyTextOutputFormat[_, _]].getName), - serde = Some(classOf[LazySimpleSerDe].getName()), - compressed = false, - properties = Map.empty - )) + storage = storageFormat) client.createTable(table, ignoreIfExists = false) val partitions = @@ -102,7 +99,7 @@ class HivePartitionFilteringSuite(version: String) test(s"getPartitionsByFilter returns all partitions when $tryDirectSqlKey=false") { val client = init(false) val filteredPartitions = client.getPartitionsByFilter(client.getTable("default", "test"), - Seq(attr("ds") === 20170101)) + Seq(attr("ds") === 20170101), SQLConf.get.sessionLocalTimeZone) assert(filteredPartitions.size == 
testPartitionCount) } @@ -297,6 +294,63 @@ class HivePartitionFilteringSuite(version: String) day :: Nil) } + test("getPartitionsByFilter: date type pruning by metastore") { + val table = CatalogTable( + identifier = TableIdentifier("test_date", Some("default")), + tableType = CatalogTableType.MANAGED, + schema = new StructType().add("value", "int").add("part", "date"), + partitionColumnNames = Seq("part"), + storage = storageFormat) + client.createTable(table, ignoreIfExists = false) + + val partitions = + for { + date <- Seq("2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04") + } yield CatalogTablePartition(Map( + "part" -> date + ), storageFormat) + assert(partitions.size == 4) + + client.createPartitions("default", "test_date", partitions, ignoreIfExists = false) + + def testDataTypeFiltering( + filterExprs: Seq[Expression], + expectedPartitionCubes: Seq[Seq[Date]]): Unit = { + val filteredPartitions = client.getPartitionsByFilter( + client.getTable("default", "test_date"), + filterExprs, + SQLConf.get.sessionLocalTimeZone) + + val expectedPartitions = expectedPartitionCubes.map { + expectedDt => + for { + dt <- expectedDt + } yield Set( + "part" -> dt.toString + ) + }.reduce(_ ++ _) + + assert(filteredPartitions.map(_.spec.toSet).toSet == expectedPartitions.toSet) + } + + val dateAttr: Attribute = AttributeReference("part", DateType)() + + testDataTypeFiltering( + Seq(dateAttr === Date.valueOf("2019-01-01")), + Seq("2019-01-01").map(Date.valueOf) :: Nil) + testDataTypeFiltering( + Seq(dateAttr > Date.valueOf("2019-01-02")), + Seq("2019-01-03", "2019-01-04").map(Date.valueOf) :: Nil) + testDataTypeFiltering( + Seq(In(dateAttr, + Seq("2019-01-01", "2019-01-02").map(d => Literal(Date.valueOf(d))))), + Seq("2019-01-01", "2019-01-02").map(Date.valueOf) :: Nil) + testDataTypeFiltering( + Seq(InSet(dateAttr, + Set("2019-01-01", "2019-01-02").map(d => Literal(Date.valueOf(d)).eval(EmptyRow)))), + Seq("2019-01-01", "2019-01-02").map(Date.valueOf) :: Nil) + } + private def testMetastorePartitionFiltering( filterExpr: Expression, expectedDs: Seq[Int], @@ -333,7 +387,7 @@ class HivePartitionFilteringSuite(version: String) val filteredPartitions = client.getPartitionsByFilter(client.getTable("default", "test"), Seq( transform(filterExpr) - )) + ), SQLConf.get.sessionLocalTimeZone) val expectedPartitionCount = expectedPartitionCubes.map { case (expectedDs, expectedH, expectedChunks) => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index c5c92ddad9014..d9ba6dd80e4ef 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -488,7 +488,8 @@ class VersionsSuite extends SparkFunSuite with Logging { test(s"$version: getPartitionsByFilter") { // Only one partition [1, 1] for key2 == 1 val result = client.getPartitionsByFilter(client.getTable("default", "src_part"), - Seq(EqualTo(AttributeReference("key2", IntegerType)(), Literal(1)))) + Seq(EqualTo(AttributeReference("key2", IntegerType)(), Literal(1))), + versionSpark.conf.sessionLocalTimeZone) // Hive 0.12 doesn't support getPartitionsByFilter, it ignores the filter condition. 
if (version != "0.12") { From 19f3b89d62932fef96e72095164920deb64ea647 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Wed, 25 Nov 2020 08:59:31 +0000 Subject: [PATCH 009/150] [SPARK-33549][SQL] Remove configuration spark.sql.legacy.allowCastNumericToTimestamp ### What changes were proposed in this pull request? Remove SQL configuration spark.sql.legacy.allowCastNumericToTimestamp ### Why are the changes needed? In the current master branch, there is a new configuration `spark.sql.legacy.allowCastNumericToTimestamp` which controls whether to cast Numeric types to Timestamp or not. The default value is true. After https://github.com/apache/spark/pull/30260, the type conversion between Timestamp type and Numeric type is disallowed in ANSI mode. So, we don't need to a separate configuration `spark.sql.legacy.allowCastNumericToTimestamp` for disallowing the conversion. Users just need to set `spark.sql.ansi.enabled` for the behavior. As the configuration is not in any released yet, we should remove the configuration to make things simpler. ### Does this PR introduce _any_ user-facing change? No, since the configuration is not released yet. ### How was this patch tested? Existing test cases Closes #30493 from gengliangwang/LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP. Authored-by: Gengliang Wang Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/Cast.scala | 13 ++----------- .../org/apache/spark/sql/internal/SQLConf.scala | 12 ------------ .../spark/sql/catalyst/expressions/CastSuite.scala | 14 -------------- .../hive/execution/HiveCompatibilitySuite.scala | 6 ------ 4 files changed, 2 insertions(+), 43 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 5afc308e52ead..e5f11b5e74916 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -59,8 +59,7 @@ object Cast { case (StringType, TimestampType) => true case (BooleanType, TimestampType) => true case (DateType, TimestampType) => true - case (_: NumericType, TimestampType) => - SQLConf.get.getConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP) + case (_: NumericType, TimestampType) => true case (StringType, DateType) => true case (TimestampType, DateType) => true @@ -273,15 +272,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit TypeCheckResult.TypeCheckSuccess } else { TypeCheckResult.TypeCheckFailure( - if (child.dataType.isInstanceOf[NumericType] && dataType.isInstanceOf[TimestampType]) { - s"cannot cast ${child.dataType.catalogString} to ${dataType.catalogString}," + - "you can enable the casting by setting " + - s"${SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key} to true," + - "but we strongly recommend using function " + - "TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS instead." 
- } else { - s"cannot cast ${child.dataType.catalogString} to ${dataType.catalogString}" - }) + s"cannot cast ${child.dataType.catalogString} to ${dataType.catalogString}") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ef974dc176e51..0738478888aeb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2824,15 +2824,6 @@ object SQLConf { .checkValue(_ > 0, "The timeout value must be positive") .createWithDefault(10L) - val LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP = - buildConf("spark.sql.legacy.allowCastNumericToTimestamp") - .internal() - .doc("When true, allow casting numeric to timestamp," + - "when false, forbid the cast, more details in SPARK-31710") - .version("3.1.0") - .booleanConf - .createWithDefault(true) - val COALESCE_BUCKETS_IN_JOIN_ENABLED = buildConf("spark.sql.bucketing.coalesceBucketsInJoin.enabled") .doc("When true, if two bucketed tables with the different number of buckets are joined, " + @@ -3550,9 +3541,6 @@ class SQLConf extends Serializable with Logging { def integerGroupingIdEnabled: Boolean = getConf(SQLConf.LEGACY_INTEGER_GROUPING_ID) - def legacyAllowCastNumericToTimestamp: Boolean = - getConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP) - def metadataCacheTTL: Long = getConf(StaticSQLConf.METADATA_CACHE_TTL_SECONDS) def coalesceBucketsInJoinEnabled: Boolean = getConf(SQLConf.COALESCE_BUCKETS_IN_JOIN_ENABLED) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index afb76d8a5a68c..2bc27ad35efff 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -1311,20 +1311,6 @@ class CastSuite extends CastSuiteBase { } } - test("SPARK-31710: fail casting from numeric to timestamp if it is forbidden") { - Seq(true, false).foreach { enable => - withSQLConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP.key -> enable.toString) { - assert(cast(2.toByte, TimestampType).resolved == enable) - assert(cast(10.toShort, TimestampType).resolved == enable) - assert(cast(3, TimestampType).resolved == enable) - assert(cast(10L, TimestampType).resolved == enable) - assert(cast(Decimal(1.2), TimestampType).resolved == enable) - assert(cast(1.7f, TimestampType).resolved == enable) - assert(cast(2.3d, TimestampType).resolved == enable) - } - } - } - test("SPARK-32828: cast from a derived user-defined type to a base type") { val v = Literal.create(Row(1), new ExampleSubTypeUDT()) checkEvaluation(cast(v, new ExampleBaseTypeUDT), Row(1)) diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index d9b6bb43c2b47..462206d8c546f 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -40,8 +40,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { private val originalInMemoryPartitionPruning = TestHive.conf.inMemoryPartitionPruning 
private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled private val originalSessionLocalTimeZone = TestHive.conf.sessionLocalTimeZone - private val originalLegacyAllowCastNumericToTimestamp = - TestHive.conf.legacyAllowCastNumericToTimestamp def testCases: Seq[(String, File)] = { hiveQueryDir.listFiles.map(f => f.getName.stripSuffix(".q") -> f) @@ -61,8 +59,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles") - // Ensures that cast numeric to timestamp enabled so that we can test them - TestHive.setConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP, true) RuleExecutor.resetMetrics() } @@ -73,8 +69,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) - TestHive.setConf(SQLConf.LEGACY_ALLOW_CAST_NUMERIC_TO_TIMESTAMP, - originalLegacyAllowCastNumericToTimestamp) // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) From 2c5cc36e3f59011009c3c6083e0d0c1c81857cbd Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Wed, 25 Nov 2020 12:41:53 +0000 Subject: [PATCH 010/150] [SPARK-33509][SQL] List partition by names from a V2 table which supports partition management ### What changes were proposed in this pull request? 1. Add new method `listPartitionByNames` to the `SupportsPartitionManagement` interface. It allows to list partitions by partition names and their values. 2. Implement new method in `InMemoryPartitionTable` which is used in DSv2 tests. ### Why are the changes needed? Currently, the `SupportsPartitionManagement` interface exposes only `listPartitionIdentifiers` which allows to list partitions by partition values. And it requires to specify all values for partition schema fields in the prefix. This restriction does not allow to list partitions by some of partition names (not all of them). For example, the table `tableA` is partitioned by two column `year` and `month` ``` CREATE TABLE tableA (price int, year int, month int) USING _ partitioned by (year, month) ``` and has the following partitions: ``` PARTITION(year = 2015, month = 1) PARTITION(year = 2015, month = 2) PARTITION(year = 2016, month = 2) PARTITION(year = 2016, month = 3) ``` If we want to list all partitions with `month = 2`, we have to specify `year` for **listPartitionIdentifiers()** which not always possible as we don't know all `year` values in advance. New method **listPartitionByNames()** allows to specify partition values only for `month`, and get two partitions: ``` PARTITION(year = 2015, month = 2) PARTITION(year = 2016, month = 2) ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running the affected test suite `SupportsPartitionManagementSuite`. Closes #30452 from MaxGekk/column-names-listPartitionIdentifiers. 
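As a simplified, standalone illustration (hypothetical `ListByNamesSketch` helper, not the `InMemoryPartitionTable` implementation), the sketch below matches partitions, modeled as name-to-value maps, against an arbitrary subset of partition names, reproducing the `month = 2` example above.
```
// Minimal sketch: keep the partitions whose values for the requested names
// equal the given values; partition names not listed stay unconstrained.
object ListByNamesSketch {
  type Partition = Map[String, Int]

  def listPartitionByNames(
      partitions: Seq[Partition],
      names: Seq[String],
      values: Seq[Int]): Seq[Partition] = {
    require(names.length == values.length, "one value per requested partition name")
    partitions.filter(part => names.map(part) == values)
  }

  def main(args: Array[String]): Unit = {
    val partitions = Seq(
      Map("year" -> 2015, "month" -> 1),
      Map("year" -> 2015, "month" -> 2),
      Map("year" -> 2016, "month" -> 2),
      Map("year" -> 2016, "month" -> 3))
    // Constrain only `month`; `year` stays free.
    println(listPartitionByNames(partitions, Seq("month"), Seq(2)))
    // List(Map(year -> 2015, month -> 2), Map(year -> 2016, month -> 2))
  }
}
```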
Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../catalog/SupportsPartitionManagement.java | 11 ++++- .../connector/InMemoryPartitionTable.scala | 22 ++++++++++ .../SupportsPartitionManagementSuite.scala | 43 ++++++++++++++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java index 446ea1463309f..380717d2e0e9b 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java @@ -106,10 +106,19 @@ Map loadPartitionMetadata(InternalRow ident) throws UnsupportedOperationException; /** - * List the identifiers of all partitions that contains the ident in a table. + * List the identifiers of all partitions that have the ident prefix in a table. * * @param ident a prefix of partition identifier * @return an array of Identifiers for the partitions */ InternalRow[] listPartitionIdentifiers(InternalRow ident); + + /** + * List the identifiers of all partitions that match to the ident by names. + * + * @param names the names of partition values in the identifier. + * @param ident a partition identifier values. + * @return an array of Identifiers for the partitions + */ + InternalRow[] listPartitionByNames(String[] names, InternalRow ident); } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala index 23987e909aa70..ba762a58b1e52 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala @@ -24,6 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, PartitionAlreadyExistsException} +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.connector.catalog.SupportsPartitionManagement import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.StructType @@ -96,4 +97,25 @@ class InMemoryPartitionTable( override protected def addPartitionKey(key: Seq[Any]): Unit = { memoryTablePartitions.put(InternalRow.fromSeq(key), Map.empty[String, String].asJava) } + + override def listPartitionByNames( + names: Array[String], + ident: InternalRow): Array[InternalRow] = { + assert(names.length == ident.numFields, + s"Number of partition names (${names.length}) must be equal to " + + s"the number of partition values (${ident.numFields}).") + val schema = partitionSchema + assert(names.forall(fieldName => schema.fieldNames.contains(fieldName)), + s"Some partition names ${names.mkString("[", ", ", "]")} don't belong to " + + s"the partition schema '${schema.sql}'.") + val indexes = names.map(schema.fieldIndex) + val dataTypes = names.map(schema(_).dataType) + val currentRow = new GenericInternalRow(new Array[Any](names.length)) + memoryTablePartitions.keySet().asScala.filter { key => + for (i <- 0 until names.length) { + currentRow.values(i) = key.get(indexes(i), dataTypes(i)) + } + currentRow == ident + }.toArray + } } diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala index e8e28e3422f27..caf7e91612563 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.connector.{InMemoryPartitionTable, InMemoryTableCatalog} +import org.apache.spark.sql.connector.{InMemoryPartitionTable, InMemoryPartitionTableCatalog, InMemoryTableCatalog} import org.apache.spark.sql.connector.expressions.{LogicalExpressions, NamedReference} import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -140,4 +140,45 @@ class SupportsPartitionManagementSuite extends SparkFunSuite { partTable.dropPartition(partIdent1) assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) } + + test("listPartitionByNames") { + val partCatalog = new InMemoryPartitionTableCatalog + partCatalog.initialize("test", CaseInsensitiveStringMap.empty()) + val table = partCatalog.createTable( + ident, + new StructType() + .add("col0", IntegerType) + .add("part0", IntegerType) + .add("part1", StringType), + Array(LogicalExpressions.identity(ref("part0")), LogicalExpressions.identity(ref("part1"))), + util.Collections.emptyMap[String, String]) + val partTable = table.asInstanceOf[InMemoryPartitionTable] + + Seq( + InternalRow(0, "abc"), + InternalRow(0, "def"), + InternalRow(1, "abc")).foreach { partIdent => + partTable.createPartition(partIdent, new util.HashMap[String, String]()) + } + + Seq( + (Array("part0", "part1"), InternalRow(0, "abc")) -> Set(InternalRow(0, "abc")), + (Array("part0"), InternalRow(0)) -> Set(InternalRow(0, "abc"), InternalRow(0, "def")), + (Array("part1"), InternalRow("abc")) -> Set(InternalRow(0, "abc"), InternalRow(1, "abc")), + (Array.empty[String], InternalRow.empty) -> + Set(InternalRow(0, "abc"), InternalRow(0, "def"), InternalRow(1, "abc")), + (Array("part0", "part1"), InternalRow(3, "xyz")) -> Set(), + (Array("part1"), InternalRow(3.14f)) -> Set() + ).foreach { case ((names, idents), expected) => + assert(partTable.listPartitionByNames(names, idents).toSet === expected) + } + // Check invalid parameters + Seq( + (Array("part0", "part1"), InternalRow(0)), + (Array("col0", "part1"), InternalRow(0, 1)), + (Array("wrong"), InternalRow("invalid")) + ).foreach { case (names, idents) => + intercept[AssertionError](partTable.listPartitionByNames(names, idents)) + } + } } From 7c59aeeef4c571838bd291079f9b804d6f546487 Mon Sep 17 00:00:00 2001 From: duripeng Date: Wed, 25 Nov 2020 12:50:21 +0000 Subject: [PATCH 011/150] [SPARK-27194][SPARK-29302][SQL] Fix commit collision in dynamic partition overwrite mode ### What changes were proposed in this pull request? When using dynamic partition overwrite, each task has its working dir under staging dir like `stagingDir/.spark-staging-{jobId}`, each task commits to `outputPath/.spark-staging-{jobId}/{partitionId}/part-{taskId}-{jobId}{ext}`. When speculation enable, multiple task attempts would be setup for one task, **they have same task id and they would commit to same file concurrently**. 
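To make the collision concrete, here is a minimal sketch (hypothetical `CommitCollisionSketch` helper, not Spark's `FileCommitProtocol` API): the committed file path encodes only the task id and job id, never the attempt id, so two speculative attempts of the same task resolve to the same target file.
```
// Minimal sketch of the colliding path: nothing in it distinguishes attempts.
object CommitCollisionSketch {
  def committedFile(
      outputPath: String,
      jobId: String,
      partition: String,
      taskId: Int,
      ext: String): String =
    s"$outputPath/.spark-staging-$jobId/$partition/part-$taskId-$jobId$ext"

  def main(args: Array[String]): Unit = {
    val attempt0 = committedFile("/out", "job-0", "p1=2", 0, ".c000.snappy.parquet")
    val attempt1 = committedFile("/out", "job-0", "p1=2", 0, ".c000.snappy.parquet")
    println(attempt0 == attempt1) // true: both attempts would write the same file
  }
}
```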
Due to host done or node preemption, the partly-committed files aren't cleaned up, a FileAlreadyExistsException would be raised in this situation, resulting in job failure. I don't try to change task commit process for dynamic partition overwrite, like adding attempt id to task working dir for each attempts and committing to final output dir via a new outputCommitCoordinator, here is reason: 1. `FileOutputCommitter` already has commit coordinator for each task attempts, we can leverage it rather than build a new one. 2. To say the least, we implement a coordinator solving task attempts commit conflict, suppose a severe case, application master failover, tasks with same attempt id and same task id would commit to same files, the `FileAlreadyExistsException` risk still exists In this pr, I leverage FileOutputCommitter to solve the problem: 1. when initing a write job description, set `outputPath/.spark-staging-{jobId}` as the output dir 2. each task attempt writes output to `outputPath/.spark-staging-{jobId}/_temporary/${appAttemptId}/_temporary/${taskAttemptId}/{partitionId}/part-{taskId}-{jobId}{ext}` 3. leverage `FileOutputCommitter` coordinator, write job firstly commits output to `outputPath/.spark-staging-{jobId}/{partitionId}` 4. for dynamic partition overwrite, write job finally move `outputPath/.spark-staging-{jobId}/{partitionId}` to `outputPath/{partitionId}` ### Why are the changes needed? Without this pr, dynamic partition overwrite would fail ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? added UT. Closes #29000 from WinkerDu/master-fix-dynamic-partition-multi-commit. Authored-by: duripeng Signed-off-by: Wenchen Fan --- .../internal/io/FileCommitProtocol.scala | 4 ++ .../io/HadoopMapReduceCommitProtocol.scala | 41 +++++++++++----- .../InsertIntoHadoopFsRelationCommand.scala | 14 +++++- .../SQLHadoopMapReduceCommitProtocol.scala | 3 +- .../sql/sources/PartitionedWriteSuite.scala | 47 ++++++++++++++++++- 5 files changed, 92 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 0746e43babf9a..d9d7b06cdb8ce 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -169,4 +169,8 @@ object FileCommitProtocol extends Logging { ctor.newInstance(jobId, outputPath) } } + + def getStagingDir(path: String, jobId: String): Path = { + new Path(path, ".spark-staging-" + jobId) + } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 11ce608f52ee2..30f9a650a69c9 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -41,13 +41,28 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime - * dynamically, i.e., we first write files under a staging - * directory with partition path, e.g. - * /path/to/staging/a=1/b=1/xxx.parquet. 
When committing the job, - * we first clean up the corresponding partition directories at - * destination path, e.g. /path/to/destination/a=1/b=1, and move - * files from staging directory to the corresponding partition - * directories under destination path. + * dynamically. Suppose final path is /path/to/outputPath, output + * path of [[FileOutputCommitter]] is an intermediate path, e.g. + * /path/to/outputPath/.spark-staging-{jobId}, which is a staging + * directory. Task attempts firstly write files under the + * intermediate path, e.g. + * /path/to/outputPath/.spark-staging-{jobId}/_temporary/ + * {appAttemptId}/_temporary/{taskAttemptId}/a=1/b=1/xxx.parquet. + * + * 1. When [[FileOutputCommitter]] algorithm version set to 1, + * we firstly move task attempt output files to + * /path/to/outputPath/.spark-staging-{jobId}/_temporary/ + * {appAttemptId}/{taskId}/a=1/b=1, + * then move them to + * /path/to/outputPath/.spark-staging-{jobId}/a=1/b=1. + * 2. When [[FileOutputCommitter]] algorithm version set to 2, + * committing tasks directly move task attempt output files to + * /path/to/outputPath/.spark-staging-{jobId}/a=1/b=1. + * + * At the end of committing job, we move output files from + * intermediate path to final path, e.g., move files from + * /path/to/outputPath/.spark-staging-{jobId}/a=1/b=1 + * to /path/to/outputPath/a=1/b=1 */ class HadoopMapReduceCommitProtocol( jobId: String, @@ -89,7 +104,7 @@ class HadoopMapReduceCommitProtocol( * The staging directory of this write job. Spark uses it to deal with files with absolute output * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. */ - private def stagingDir = new Path(path, ".spark-staging-" + jobId) + protected def stagingDir = getStagingDir(path, jobId) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.getConstructor().newInstance() @@ -106,13 +121,13 @@ class HadoopMapReduceCommitProtocol( val filename = getFilename(taskContext, ext) val stagingDir: Path = committer match { - case _ if dynamicPartitionOverwrite => - assert(dir.isDefined, - "The dataset to be written must be partitioned when dynamicPartitionOverwrite is true.") - partitionPaths += dir.get - this.stagingDir // For FileOutputCommitter it has its own staging path called "work path". 
case f: FileOutputCommitter => + if (dynamicPartitionOverwrite) { + assert(dir.isDefined, + "The dataset to be written must be partitioned when dynamicPartitionOverwrite is true.") + partitionPaths += dir.get + } new Path(Option(f.getWorkPath).map(_.toString).getOrElse(path)) case _ => new Path(path) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index fe733f4238e1a..db7264d0c6ec8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -106,9 +106,10 @@ case class InsertIntoHadoopFsRelationCommand( fs, catalogTable.get, qualifiedOutputPath, matchingPartitions) } + val jobId = java.util.UUID.randomUUID().toString val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, - jobId = java.util.UUID.randomUUID().toString, + jobId = jobId, outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) @@ -163,6 +164,15 @@ case class InsertIntoHadoopFsRelationCommand( } } + // For dynamic partition overwrite, FileOutputCommitter's output path is staging path, files + // will be renamed from staging path to final output path during commit job + val committerOutputPath = if (dynamicPartitionOverwrite) { + FileCommitProtocol.getStagingDir(outputPath.toString, jobId) + .makeQualified(fs.getUri, fs.getWorkingDirectory) + } else { + qualifiedOutputPath + } + val updatedPartitionPaths = FileFormatWriter.write( sparkSession = sparkSession, @@ -170,7 +180,7 @@ case class InsertIntoHadoopFsRelationCommand( fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec( - qualifiedOutputPath.toString, customPartitionLocations, outputColumns), + committerOutputPath.toString, customPartitionLocations, outputColumns), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 39c594a9bc618..144be2316f091 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -55,7 +55,8 @@ class SQLHadoopMapReduceCommitProtocol( // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) - committer = ctor.newInstance(new Path(path), context) + val committerOutputPath = if (dynamicPartitionOverwrite) stagingDir else new Path(path) + committer = ctor.newInstance(committerOutputPath, context) } else { // The specified output committer is just an OutputCommitter. // So, we will use the no-argument constructor. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala index 6df1c5db14c26..52825a155e46a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PartitionedWriteSuite.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.sources import java.io.File import java.sql.Timestamp -import org.apache.hadoop.mapreduce.TaskAttemptContext +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.TestUtils import org.apache.spark.internal.Logging @@ -164,4 +165,48 @@ class PartitionedWriteSuite extends QueryTest with SharedSparkSession { assert(e.getMessage.contains("Found duplicate column(s) b, b: `b`;")) } } + + test("SPARK-27194 SPARK-29302: Fix commit collision in dynamic partition overwrite mode") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> + SQLConf.PartitionOverwriteMode.DYNAMIC.toString, + SQLConf.FILE_COMMIT_PROTOCOL_CLASS.key -> + classOf[PartitionFileExistCommitProtocol].getName) { + withTempDir { d => + withTable("t") { + sql( + s""" + | create table t(c1 int, p1 int) using parquet partitioned by (p1) + | location '${d.getAbsolutePath}' + """.stripMargin) + + val df = Seq((1, 2)).toDF("c1", "p1") + df.write + .partitionBy("p1") + .mode("overwrite") + .saveAsTable("t") + checkAnswer(sql("select * from t"), df) + } + } + } + } +} + +/** + * A file commit protocol with pre-created partition file. when try to overwrite partition dir + * in dynamic partition mode, FileAlreadyExist exception would raise without SPARK-27194 + */ +private class PartitionFileExistCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean) + extends SQLHadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) { + override def setupJob(jobContext: JobContext): Unit = { + super.setupJob(jobContext) + val stagingDir = new File(new Path(path).toUri.getPath, s".spark-staging-$jobId") + stagingDir.mkdirs() + val stagingPartDir = new File(stagingDir, "p1=2") + stagingPartDir.mkdirs() + val conflictTaskFile = new File(stagingPartDir, s"part-00000-$jobId.c000.snappy.parquet") + conflictTaskFile.createNewFile() + } } From 6f68ccf532ec3fdd7224ba05c52bce58372572e9 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 25 Nov 2020 15:09:02 +0000 Subject: [PATCH 012/150] [SPARK-31257][SPARK-33561][SQL] Unify create table syntax ### What changes were proposed in this pull request? * Unify the create table syntax in the parser by merging Hive and DataSource clauses * Add `SerdeInfo` and `external` boolean to statement plans and update AstBuilder to produce them * Add conversion from create statement plan to v1 create plans in ResolveSessionCatalog * Support new statement clauses in ResolveCatalogs conversion to v2 create plans * Remove SparkSqlParser rules for Hive syntax * Add "option." namespace to distinguish SERDEPROPERTIES and OPTIONS in table properties ### Why are the changes needed? * Current behavior is confusing. * A way to pass the Hive create options to DSv2 is needed for a Hive source. ### Does this PR introduce any user-facing change? Not by default, but v2 sources will be able to handle STORED AS and other Hive clauses. ### How was this patch tested? Existing tests validate there are no behavior changes. 
Update unit tests for using a statement plan for Hive create syntax: * Move create tests from spark-sql DDLParserSuite into PlanResolutionSuite * Add parser tests to spark-catalyst DDLParserSuite Closes #28026 from rdblue/unify-create-table. Lead-authored-by: Ryan Blue Co-authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/SqlBase.g4 | 29 +- .../sql/connector/catalog/TableCatalog.java | 10 + .../catalyst/analysis/ResolveCatalogs.scala | 16 +- .../sql/catalyst/parser/AstBuilder.scala | 331 +++++++-- .../catalyst/plans/logical/statements.scala | 81 +++ .../sql/connector/catalog/CatalogV2Util.scala | 55 +- .../sql/catalyst/parser/DDLParserSuite.scala | 348 ++++++++- .../apache/spark/sql/DataFrameWriter.scala | 5 +- .../apache/spark/sql/DataFrameWriterV2.scala | 5 +- .../analysis/ResolveSessionCatalog.scala | 111 ++- .../spark/sql/execution/SparkSqlParser.scala | 394 ++--------- .../datasources/v2/V2SessionCatalog.scala | 8 +- .../sql/connector/DataSourceV2SQLSuite.scala | 4 +- .../sql/execution/SparkSqlParserSuite.scala | 129 +--- .../execution/command/DDLParserSuite.scala | 524 +------------- .../command/PlanResolutionSuite.scala | 660 +++++++++++++++++- .../sources/CreateTableAsSelectSuite.scala | 4 +- .../sql/hive/execution/HiveDDLSuite.scala | 24 +- .../sql/hive/execution/HiveSerDeSuite.scala | 7 +- .../sql/hive/execution/SQLQuerySuite.scala | 3 +- 20 files changed, 1626 insertions(+), 1122 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6b6b751cc3c15..5d17028c32ae2 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -119,20 +119,9 @@ statement (RESTRICT | CASCADE)? #dropNamespace | SHOW (DATABASES | NAMESPACES) ((FROM | IN) multipartIdentifier)? (LIKE? pattern=STRING)? #showNamespaces - | createTableHeader ('(' colTypeList ')')? tableProvider + | createTableHeader ('(' colTypeList ')')? tableProvider? createTableClauses (AS? query)? #createTable - | createTableHeader ('(' columns=colTypeList ')')? - (commentSpec | - (PARTITIONED BY '(' partitionColumns=colTypeList ')' | - PARTITIONED BY partitionColumnNames=identifierList) | - bucketSpec | - skewSpec | - rowFormat | - createFileFormat | - locationSpec | - (TBLPROPERTIES tableProps=tablePropertyList))* - (AS? query)? #createHiveTable | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier LIKE source=tableIdentifier (tableProvider | @@ -140,7 +129,7 @@ statement createFileFormat | locationSpec | (TBLPROPERTIES tableProps=tablePropertyList))* #createTableLike - | replaceTableHeader ('(' colTypeList ')')? tableProvider + | replaceTableHeader ('(' colTypeList ')')? tableProvider? createTableClauses (AS? query)? #replaceTable | ANALYZE TABLE multipartIdentifier partitionSpec? 
COMPUTE STATISTICS @@ -393,8 +382,11 @@ tableProvider createTableClauses :((OPTIONS options=tablePropertyList) | - (PARTITIONED BY partitioning=transformList) | + (PARTITIONED BY partitioning=partitionFieldList) | + skewSpec | bucketSpec | + rowFormat | + createFileFormat | locationSpec | commentSpec | (TBLPROPERTIES tableProps=tablePropertyList))* @@ -741,8 +733,13 @@ namedExpressionSeq : namedExpression (',' namedExpression)* ; -transformList - : '(' transforms+=transform (',' transforms+=transform)* ')' +partitionFieldList + : '(' fields+=partitionField (',' fields+=partitionField)* ')' + ; + +partitionField + : transform #partitionTransform + | colType #partitionColumn ; transform diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java index 92079d127b1e3..52a74ab9dd9f5 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/TableCatalog.java @@ -46,6 +46,11 @@ public interface TableCatalog extends CatalogPlugin { */ String PROP_LOCATION = "location"; + /** + * A reserved property to specify a table was created with EXTERNAL. + */ + String PROP_EXTERNAL = "external"; + /** * A reserved property to specify the description of the table. */ @@ -61,6 +66,11 @@ public interface TableCatalog extends CatalogPlugin { */ String PROP_OWNER = "owner"; + /** + * A prefix used to pass OPTIONS in table properties + */ + String OPTION_PREFIX = "option."; + /** * List the tables in a namespace from the catalog. *

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index deeb8215d22c6..7354d2478b7c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -143,7 +143,7 @@ class ResolveCatalogs(val catalogManager: CatalogManager) RenameTable(catalog.asTableCatalog, oldName.asIdentifier, newNameParts.asIdentifier) case c @ CreateTableStatement( - NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) assertNoCharTypeInSchema(c.tableSchema) CreateV2Table( @@ -152,11 +152,11 @@ class ResolveCatalogs(val catalogManager: CatalogManager) c.tableSchema, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + convertTableProperties(c), ignoreIfExists = c.ifNotExists) case c @ CreateTableAsSelectStatement( - NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => + NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _, _) => if (c.asSelect.resolved) { assertNoNullTypeInSchema(c.asSelect.schema) } @@ -166,12 +166,12 @@ class ResolveCatalogs(val catalogManager: CatalogManager) // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), c.asSelect, - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + convertTableProperties(c), writeOptions = c.writeOptions, ignoreIfExists = c.ifNotExists) case c @ ReplaceTableStatement( - NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) assertNoCharTypeInSchema(c.tableSchema) ReplaceTable( @@ -180,11 +180,11 @@ class ResolveCatalogs(val catalogManager: CatalogManager) c.tableSchema, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + convertTableProperties(c), orCreate = c.orCreate) case c @ ReplaceTableAsSelectStatement( - NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => + NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _) => if (c.asSelect.resolved) { assertNoNullTypeInSchema(c.asSelect.schema) } @@ -194,7 +194,7 @@ class ResolveCatalogs(val catalogManager: CatalogManager) // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), c.asSelect, - convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + convertTableProperties(c), writeOptions = c.writeOptions, orCreate = c.orCreate) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 5f8394c525949..25423e510157a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2459,10 +2459,22 @@ class AstBuilder extends 
SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg /** * Type to keep track of table clauses: - * (partitioning, bucketSpec, properties, options, location, comment). + * - partition transforms + * - partition columns + * - bucketSpec + * - properties + * - options + * - location + * - comment + * - serde + * + * Note: Partition transforms are based on existing table schema definition. It can be simple + * column names, or functions like `year(date_col)`. Partition columns are column names with data + * types like `i INT`, which should be appended to the existing table schema. */ - type TableClauses = (Seq[Transform], Option[BucketSpec], Map[String, String], - Map[String, String], Option[String], Option[String]) + type TableClauses = ( + Seq[Transform], Seq[StructField], Option[BucketSpec], Map[String, String], + Map[String, String], Option[String], Option[String], Option[SerdeInfo]) /** * Validate a create table statement and return the [[TableIdentifier]]. @@ -2495,9 +2507,22 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Parse a list of transforms. + * Parse a list of transforms or columns. */ - override def visitTransformList(ctx: TransformListContext): Seq[Transform] = withOrigin(ctx) { + override def visitPartitionFieldList( + ctx: PartitionFieldListContext): (Seq[Transform], Seq[StructField]) = withOrigin(ctx) { + val (transforms, columns) = ctx.fields.asScala.map { + case transform: PartitionTransformContext => + (Some(visitPartitionTransform(transform)), None) + case field: PartitionColumnContext => + (None, Some(visitColType(field.colType))) + }.unzip + + (transforms.flatten.toSeq, columns.flatten.toSeq) + } + + override def visitPartitionTransform( + ctx: PartitionTransformContext): Transform = withOrigin(ctx) { def getFieldReference( ctx: ApplyTransformContext, arg: V2Expression): FieldReference = { @@ -2524,7 +2549,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } } - ctx.transforms.asScala.map { + ctx.transform match { case identityCtx: IdentityTransformContext => IdentityTransform(FieldReference(typedVisit[Seq[String]](identityCtx.qualifiedName))) @@ -2563,7 +2588,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg case name => ApplyTransform(name, arguments) } - }.toSeq + } } /** @@ -2763,16 +2788,157 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg (filtered, path) } + /** + * Create a [[SerdeInfo]] for creating tables. + * + * Format: STORED AS (name | INPUTFORMAT input_format OUTPUTFORMAT output_format) + */ + override def visitCreateFileFormat(ctx: CreateFileFormatContext): SerdeInfo = withOrigin(ctx) { + (ctx.fileFormat, ctx.storageHandler) match { + // Expected format: INPUTFORMAT input_format OUTPUTFORMAT output_format + case (c: TableFileFormatContext, null) => + SerdeInfo(formatClasses = Some(FormatClasses(string(c.inFmt), string(c.outFmt)))) + // Expected format: SEQUENCEFILE | TEXTFILE | RCFILE | ORC | PARQUET | AVRO + case (c: GenericFileFormatContext, null) => + SerdeInfo(storedAs = Some(c.identifier.getText)) + case (null, storageHandler) => + operationNotAllowed("STORED BY", ctx) + case _ => + throw new ParseException("Expected either STORED AS or STORED BY, not both", ctx) + } + } + + /** + * Create a [[SerdeInfo]] used for creating tables. 
+ * + * Example format: + * {{{ + * SERDE serde_name [WITH SERDEPROPERTIES (k1=v1, k2=v2, ...)] + * }}} + * + * OR + * + * {{{ + * DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] + * [COLLECTION ITEMS TERMINATED BY char] + * [MAP KEYS TERMINATED BY char] + * [LINES TERMINATED BY char] + * [NULL DEFINED AS char] + * }}} + */ + def visitRowFormat(ctx: RowFormatContext): SerdeInfo = withOrigin(ctx) { + ctx match { + case serde: RowFormatSerdeContext => visitRowFormatSerde(serde) + case delimited: RowFormatDelimitedContext => visitRowFormatDelimited(delimited) + } + } + + /** + * Create SERDE row format name and properties pair. + */ + override def visitRowFormatSerde(ctx: RowFormatSerdeContext): SerdeInfo = withOrigin(ctx) { + import ctx._ + SerdeInfo( + serde = Some(string(name)), + serdeProperties = Option(tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)) + } + + /** + * Create a delimited row format properties object. + */ + override def visitRowFormatDelimited( + ctx: RowFormatDelimitedContext): SerdeInfo = withOrigin(ctx) { + // Collect the entries if any. + def entry(key: String, value: Token): Seq[(String, String)] = { + Option(value).toSeq.map(x => key -> string(x)) + } + // TODO we need proper support for the NULL format. + val entries = + entry("field.delim", ctx.fieldsTerminatedBy) ++ + entry("serialization.format", ctx.fieldsTerminatedBy) ++ + entry("escape.delim", ctx.escapedBy) ++ + // The following typo is inherited from Hive... + entry("colelction.delim", ctx.collectionItemsTerminatedBy) ++ + entry("mapkey.delim", ctx.keysTerminatedBy) ++ + Option(ctx.linesSeparatedBy).toSeq.map { token => + val value = string(token) + validate( + value == "\n", + s"LINES TERMINATED BY only supports newline '\\n' right now: $value", + ctx) + "line.delim" -> value + } + SerdeInfo(serdeProperties = entries.toMap) + } + + /** + * Throw a [[ParseException]] if the user specified incompatible SerDes through ROW FORMAT + * and STORED AS. + * + * The following are allowed. Anything else is not: + * ROW FORMAT SERDE ... STORED AS [SEQUENCEFILE | RCFILE | TEXTFILE] + * ROW FORMAT DELIMITED ... STORED AS TEXTFILE + * ROW FORMAT ... STORED AS INPUTFORMAT ... OUTPUTFORMAT ... 
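To make the allowed combinations above concrete, the statements below mirror the DDLParserSuite cases added later in this patch; they are illustrative SQL strings only, and the serde, format and table names are placeholders.

```scala
// Accepted: SERDE with a format that does not bundle its own serde, and DELIMITED with text.
val accepted = Seq(
  "CREATE TABLE t (id BIGINT) ROW FORMAT SERDE 'customSerde' STORED AS rcfile",
  "CREATE TABLE t (id BIGINT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS textfile",
  "CREATE TABLE t (id BIGINT) ROW FORMAT SERDE 'customSerde' " +
    "STORED AS INPUTFORMAT 'inFormat' OUTPUTFORMAT 'outFormat'")

// Rejected by validateRowFormatFileFormat: the named file format already implies a serde,
// or DELIMITED is combined with a non-text format.
val rejected = Seq(
  "CREATE TABLE t (id BIGINT) ROW FORMAT SERDE 'customSerde' STORED AS parquet",
  "CREATE TABLE t (id BIGINT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS orc")
```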
+ */ + protected def validateRowFormatFileFormat( + rowFormatCtx: RowFormatContext, + createFileFormatCtx: CreateFileFormatContext, + parentCtx: ParserRuleContext): Unit = { + if (rowFormatCtx == null || createFileFormatCtx == null) { + return + } + (rowFormatCtx, createFileFormatCtx.fileFormat) match { + case (_, ffTable: TableFileFormatContext) => // OK + case (rfSerde: RowFormatSerdeContext, ffGeneric: GenericFileFormatContext) => + ffGeneric.identifier.getText.toLowerCase(Locale.ROOT) match { + case ("sequencefile" | "textfile" | "rcfile") => // OK + case fmt => + operationNotAllowed( + s"ROW FORMAT SERDE is incompatible with format '$fmt', which also specifies a serde", + parentCtx) + } + case (rfDelimited: RowFormatDelimitedContext, ffGeneric: GenericFileFormatContext) => + ffGeneric.identifier.getText.toLowerCase(Locale.ROOT) match { + case "textfile" => // OK + case fmt => operationNotAllowed( + s"ROW FORMAT DELIMITED is only compatible with 'textfile', not '$fmt'", parentCtx) + } + case _ => + // should never happen + def str(ctx: ParserRuleContext): String = { + (0 until ctx.getChildCount).map { i => ctx.getChild(i).getText }.mkString(" ") + } + operationNotAllowed( + s"Unexpected combination of ${str(rowFormatCtx)} and ${str(createFileFormatCtx)}", + parentCtx) + } + } + + protected def validateRowFormatFileFormat( + rowFormatCtx: Seq[RowFormatContext], + createFileFormatCtx: Seq[CreateFileFormatContext], + parentCtx: ParserRuleContext): Unit = { + if (rowFormatCtx.size == 1 && createFileFormatCtx.size == 1) { + validateRowFormatFileFormat(rowFormatCtx.head, createFileFormatCtx.head, parentCtx) + } + } + override def visitCreateTableClauses(ctx: CreateTableClausesContext): TableClauses = { checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) checkDuplicateClauses(ctx.OPTIONS, "OPTIONS", ctx) checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) + checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) - val partitioning: Seq[Transform] = - Option(ctx.partitioning).map(visitTransformList).getOrElse(Nil) + if (ctx.skewSpec.size > 0) { + operationNotAllowed("CREATE TABLE ... 
SKEWED BY", ctx) + } + + val (partTransforms, partCols) = + Option(ctx.partitioning).map(visitPartitionFieldList).getOrElse((Nil, Nil)) val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) val cleanedProperties = cleanTableProperties(ctx, properties) @@ -2780,7 +2946,45 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg val location = visitLocationSpecList(ctx.locationSpec()) val (cleanedOptions, newLocation) = cleanTableOptions(ctx, options, location) val comment = visitCommentSpecList(ctx.commentSpec()) - (partitioning, bucketSpec, cleanedProperties, cleanedOptions, newLocation, comment) + val serdeInfo = getSerdeInfo(ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx) + (partTransforms, partCols, bucketSpec, cleanedProperties, cleanedOptions, newLocation, comment, + serdeInfo) + } + + protected def getSerdeInfo( + rowFormatCtx: Seq[RowFormatContext], + createFileFormatCtx: Seq[CreateFileFormatContext], + ctx: ParserRuleContext, + skipCheck: Boolean = false): Option[SerdeInfo] = { + if (!skipCheck) validateRowFormatFileFormat(rowFormatCtx, createFileFormatCtx, ctx) + val rowFormatSerdeInfo = rowFormatCtx.map(visitRowFormat) + val fileFormatSerdeInfo = createFileFormatCtx.map(visitCreateFileFormat) + (fileFormatSerdeInfo ++ rowFormatSerdeInfo).reduceLeftOption((l, r) => l.merge(r)) + } + + private def partitionExpressions( + partTransforms: Seq[Transform], + partCols: Seq[StructField], + ctx: ParserRuleContext): Seq[Transform] = { + if (partTransforms.nonEmpty) { + if (partCols.nonEmpty) { + val references = partTransforms.map(_.describe()).mkString(", ") + val columns = partCols + .map(field => s"${field.name} ${field.dataType.simpleString}") + .mkString(", ") + operationNotAllowed( + s"""PARTITION BY: Cannot mix partition expressions and partition columns: + |Expressions: $references + |Columns: $columns""".stripMargin, ctx) + + } + partTransforms + } else { + // columns were added to create the schema. convert to column references + partCols.map { column => + IdentityTransform(FieldReference(Seq(column.name))) + } + } } /** @@ -2789,13 +2993,15 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * Expected format: * {{{ * CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name - * USING table_provider + * [USING table_provider] * create_table_clauses * [[AS] select_statement]; * * create_table_clauses (order insensitive): + * [PARTITIONED BY (partition_fields)] * [OPTIONS table_property_list] - * [PARTITIONED BY (col_name, transform(col_name), transform(constant, col_name), ...)] + * [ROW FORMAT row_format] + * [STORED AS file_format] * [CLUSTERED BY (col_name, col_name, ...) * [SORTED BY (col_name [ASC|DESC], ...)] * INTO num_buckets BUCKETS @@ -2803,40 +3009,55 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * [LOCATION path] * [COMMENT table_comment] * [TBLPROPERTIES (property_name=property_value, ...)] + * + * partition_fields: + * col_name, transform(col_name), transform(constant, col_name), ... | + * col_name data_type [NOT NULL] [COMMENT col_comment], ... 
* }}} */ override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = withOrigin(ctx) { val (table, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader) - if (external) { - operationNotAllowed("CREATE EXTERNAL TABLE ...", ctx) - } - val schema = Option(ctx.colTypeList()).map(createSchema) + + val columns = Option(ctx.colTypeList()).map(visitColTypeList).getOrElse(Nil) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) - val (partitioning, bucketSpec, properties, options, location, comment) = + val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo) = visitCreateTableClauses(ctx.createTableClauses()) - Option(ctx.query).map(plan) match { - case Some(_) if temp => - operationNotAllowed("CREATE TEMPORARY TABLE ... USING ... AS query", ctx) + if (provider.isDefined && serdeInfo.isDefined) { + operationNotAllowed(s"CREATE TABLE ... USING ... ${serdeInfo.get.describe}", ctx) + } + + if (temp) { + val asSelect = if (ctx.query == null) "" else " AS ..." + operationNotAllowed( + s"CREATE TEMPORARY TABLE ...$asSelect, use CREATE TEMPORARY VIEW instead", ctx) + } - case Some(_) if schema.isDefined => + val partitioning = partitionExpressions(partTransforms, partCols, ctx) + + Option(ctx.query).map(plan) match { + case Some(_) if columns.nonEmpty => operationNotAllowed( "Schema may not be specified in a Create Table As Select (CTAS) statement", ctx) + case Some(_) if partCols.nonEmpty => + // non-reference partition columns are not allowed because schema can't be specified + operationNotAllowed( + "Partition column types may not be specified in Create Table As Select (CTAS)", + ctx) + case Some(query) => CreateTableAsSelectStatement( table, query, partitioning, bucketSpec, properties, provider, options, location, comment, - writeOptions = Map.empty, ifNotExists = ifNotExists) - - case None if temp => - // CREATE TEMPORARY TABLE ... USING ... is not supported by the catalyst parser. - // Use CREATE TEMPORARY VIEW ... USING ... instead. - operationNotAllowed("CREATE TEMPORARY TABLE IF NOT EXISTS", ctx) + writeOptions = Map.empty, serdeInfo, external = external, ifNotExists = ifNotExists) case _ => - CreateTableStatement(table, schema.getOrElse(new StructType), partitioning, bucketSpec, - properties, provider, options, location, comment, ifNotExists = ifNotExists) + // Note: table schema includes both the table columns list and the partition columns + // with data type. + val schema = StructType(columns ++ partCols) + CreateTableStatement(table, schema, partitioning, bucketSpec, properties, provider, + options, location, comment, serdeInfo, external = external, ifNotExists = ifNotExists) } } @@ -2846,13 +3067,13 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * Expected format: * {{{ * [CREATE OR] REPLACE TABLE [db_name.]table_name - * USING table_provider + * [USING table_provider] * replace_table_clauses * [[AS] select_statement]; * * replace_table_clauses (order insensitive): * [OPTIONS table_property_list] - * [PARTITIONED BY (col_name, transform(col_name), transform(constant, col_name), ...)] + * [PARTITIONED BY (partition_fields)] * [CLUSTERED BY (col_name, col_name, ...) 
* [SORTED BY (col_name [ASC|DESC], ...)] * INTO num_buckets BUCKETS @@ -2860,33 +3081,63 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * [LOCATION path] * [COMMENT table_comment] * [TBLPROPERTIES (property_name=property_value, ...)] + * + * partition_fields: + * col_name, transform(col_name), transform(constant, col_name), ... | + * col_name data_type [NOT NULL] [COMMENT col_comment], ... * }}} */ override def visitReplaceTable(ctx: ReplaceTableContext): LogicalPlan = withOrigin(ctx) { - val (table, _, ifNotExists, external) = visitReplaceTableHeader(ctx.replaceTableHeader) + val (table, temp, ifNotExists, external) = visitReplaceTableHeader(ctx.replaceTableHeader) + val orCreate = ctx.replaceTableHeader().CREATE() != null + + if (temp) { + val action = if (orCreate) "CREATE OR REPLACE" else "REPLACE" + operationNotAllowed(s"$action TEMPORARY TABLE ..., use $action TEMPORARY VIEW instead.", ctx) + } + if (external) { - operationNotAllowed("REPLACE EXTERNAL TABLE ... USING", ctx) + operationNotAllowed("REPLACE EXTERNAL TABLE ...", ctx) + } + + if (ifNotExists) { + operationNotAllowed("REPLACE ... IF NOT EXISTS, use CREATE IF NOT EXISTS instead", ctx) } - val (partitioning, bucketSpec, properties, options, location, comment) = + val (partTransforms, partCols, bucketSpec, properties, options, location, comment, serdeInfo) = visitCreateTableClauses(ctx.createTableClauses()) - val schema = Option(ctx.colTypeList()).map(createSchema) + val columns = Option(ctx.colTypeList()).map(visitColTypeList).getOrElse(Nil) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText) - val orCreate = ctx.replaceTableHeader().CREATE() != null + + if (provider.isDefined && serdeInfo.isDefined) { + operationNotAllowed(s"CREATE TABLE ... USING ... ${serdeInfo.get.describe}", ctx) + } + + val partitioning = partitionExpressions(partTransforms, partCols, ctx) Option(ctx.query).map(plan) match { - case Some(_) if schema.isDefined => + case Some(_) if columns.nonEmpty => operationNotAllowed( "Schema may not be specified in a Replace Table As Select (RTAS) statement", ctx) + case Some(_) if partCols.nonEmpty => + // non-reference partition columns are not allowed because schema can't be specified + operationNotAllowed( + "Partition column types may not be specified in Replace Table As Select (RTAS)", + ctx) + case Some(query) => ReplaceTableAsSelectStatement(table, query, partitioning, bucketSpec, properties, - provider, options, location, comment, writeOptions = Map.empty, orCreate = orCreate) + provider, options, location, comment, writeOptions = Map.empty, serdeInfo, + orCreate = orCreate) case _ => - ReplaceTableStatement(table, schema.getOrElse(new StructType), partitioning, - bucketSpec, properties, provider, options, location, comment, orCreate = orCreate) + // Note: table schema includes both the table columns list and the partition columns + // with data type. 
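As a concrete reading of the note above (the CREATE TABLE branch behaves the same way), `REPLACE TABLE my_tab (id bigint) PARTITIONED BY (part string)` yields the schema `id bigint, part string` plus an identity transform on `part`, which is what the "partition column definitions" test added later expects. A minimal sketch using Spark's public types rather than the parser's internal classes:

```scala
import org.apache.spark.sql.types._

// Columns from the explicit column list and from PARTITIONED BY (part string).
val columns  = Seq(StructField("id", LongType))
val partCols = Seq(StructField("part", StringType))

// The parsed statement carries the combined schema...
val schema = StructType(columns ++ partCols)                  // id BIGINT, part STRING

// ...while each partition column definition is referenced as an identity transform.
val partitioning = partCols.map(c => s"identity(${c.name})")  // Seq("identity(part)")
```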
+ val schema = StructType(columns ++ partCols) + ReplaceTableStatement(table, schema, partitioning, bucketSpec, properties, provider, + options, location, comment, serdeInfo, orCreate = orCreate) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 3660e8a95a7f6..281d57b3648f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -53,6 +53,81 @@ abstract class ParsedStatement extends LogicalPlan { final override lazy val resolved = false } +/** + * Type to keep track of Hive serde info + */ +case class SerdeInfo( + storedAs: Option[String] = None, + formatClasses: Option[FormatClasses] = None, + serde: Option[String] = None, + serdeProperties: Map[String, String] = Map.empty) { + // this uses assertions because validation is done in validateRowFormatFileFormat etc. + assert(storedAs.isEmpty || formatClasses.isEmpty, + "Cannot specify both STORED AS and INPUTFORMAT/OUTPUTFORMAT") + + def describe: String = { + val serdeString = if (serde.isDefined || serdeProperties.nonEmpty) { + "ROW FORMAT " + serde.map(sd => s"SERDE $sd").getOrElse("DELIMITED") + } else { + "" + } + + this match { + case SerdeInfo(Some(storedAs), _, _, _) => + s"STORED AS $storedAs $serdeString" + case SerdeInfo(_, Some(formatClasses), _, _) => + s"STORED AS $formatClasses $serdeString" + case _ => + serdeString + } + } + + def merge(other: SerdeInfo): SerdeInfo = { + def getOnly[T](desc: String, left: Option[T], right: Option[T]): Option[T] = { + (left, right) match { + case (Some(l), Some(r)) => + assert(l == r, s"Conflicting $desc values: $l != $r") + left + case (Some(_), _) => + left + case (_, Some(_)) => + right + case _ => + None + } + } + + SerdeInfo.checkSerdePropMerging(serdeProperties, other.serdeProperties) + SerdeInfo( + getOnly("STORED AS", storedAs, other.storedAs), + getOnly("INPUTFORMAT/OUTPUTFORMAT", formatClasses, other.formatClasses), + getOnly("SERDE", serde, other.serde), + serdeProperties ++ other.serdeProperties) + } +} + +case class FormatClasses(input: String, output: String) { + override def toString: String = s"INPUTFORMAT $input OUTPUTFORMAT $output" +} + +object SerdeInfo { + val empty: SerdeInfo = SerdeInfo(None, None, None, Map.empty) + + def checkSerdePropMerging( + props1: Map[String, String], props2: Map[String, String]): Unit = { + val conflictKeys = props1.keySet.intersect(props2.keySet) + if (conflictKeys.nonEmpty) { + throw new UnsupportedOperationException( + s""" + |Cannot safely merge SERDEPROPERTIES: + |${props1.map { case (k, v) => s"$k=$v" }.mkString("{", ",", "}")} + |${props2.map { case (k, v) => s"$k=$v" }.mkString("{", ",", "}")} + |The conflict keys: ${conflictKeys.mkString(", ")} + |""".stripMargin) + } + } +} + /** * A CREATE TABLE command, as parsed from SQL. 
* @@ -68,6 +143,8 @@ case class CreateTableStatement( options: Map[String, String], location: Option[String], comment: Option[String], + serde: Option[SerdeInfo], + external: Boolean, ifNotExists: Boolean) extends ParsedStatement /** @@ -84,6 +161,8 @@ case class CreateTableAsSelectStatement( location: Option[String], comment: Option[String], writeOptions: Map[String, String], + serde: Option[SerdeInfo], + external: Boolean, ifNotExists: Boolean) extends ParsedStatement { override def children: Seq[LogicalPlan] = Seq(asSelect) @@ -119,6 +198,7 @@ case class ReplaceTableStatement( options: Map[String, String], location: Option[String], comment: Option[String], + serde: Option[SerdeInfo], orCreate: Boolean) extends ParsedStatement /** @@ -135,6 +215,7 @@ case class ReplaceTableAsSelectStatement( location: Option[String], comment: Option[String], writeOptions: Map[String, String], + serde: Option[SerdeInfo], orCreate: Boolean) extends ParsedStatement { override def children: Seq[LogicalPlan] = Seq(asSelect) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala index 1a3a7207c6ca9..b6dc4f61c8588 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{NamedRelation, NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException, UnresolvedV2Relation} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.logical.AlterTable +import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, CreateTableAsSelectStatement, CreateTableStatement, ReplaceTableAsSelectStatement, ReplaceTableStatement, SerdeInfo} import org.apache.spark.sql.connector.catalog.TableChange._ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.types.{ArrayType, DataType, HIVE_TYPE_STRING, HiveStringType, MapType, NullType, StructField, StructType} @@ -295,18 +295,65 @@ private[sql] object CatalogV2Util { catalog.name().equalsIgnoreCase(CatalogManager.SESSION_CATALOG_NAME) } - def convertTableProperties( + def convertTableProperties(c: CreateTableStatement): Map[String, String] = { + convertTableProperties( + c.properties, c.options, c.serde, c.location, c.comment, c.provider, c.external) + } + + def convertTableProperties(c: CreateTableAsSelectStatement): Map[String, String] = { + convertTableProperties( + c.properties, c.options, c.serde, c.location, c.comment, c.provider, c.external) + } + + def convertTableProperties(r: ReplaceTableStatement): Map[String, String] = { + convertTableProperties(r.properties, r.options, r.serde, r.location, r.comment, r.provider) + } + + def convertTableProperties(r: ReplaceTableAsSelectStatement): Map[String, String] = { + convertTableProperties(r.properties, r.options, r.serde, r.location, r.comment, r.provider) + } + + private def convertTableProperties( properties: Map[String, String], options: Map[String, String], + serdeInfo: Option[SerdeInfo], location: Option[String], comment: Option[String], - provider: Option[String]): Map[String, String] = { - properties ++ options ++ + provider: Option[String], + external: Boolean = false): Map[String, String] = { + 
properties ++ + options ++ // to make the transition to the "option." prefix easier, add both + options.map { case (key, value) => TableCatalog.OPTION_PREFIX + key -> value } ++ + convertToProperties(serdeInfo) ++ + (if (external) Some(TableCatalog.PROP_EXTERNAL -> "true") else None) ++ provider.map(TableCatalog.PROP_PROVIDER -> _) ++ comment.map(TableCatalog.PROP_COMMENT -> _) ++ location.map(TableCatalog.PROP_LOCATION -> _) } + /** + * Converts Hive Serde info to table properties. The mapped property keys are: + * - INPUTFORMAT/OUTPUTFORMAT: hive.input/output-format + * - STORED AS: hive.stored-as + * - ROW FORMAT SERDE: hive.serde + * - SERDEPROPERTIES: add "option." prefix + */ + private def convertToProperties(serdeInfo: Option[SerdeInfo]): Map[String, String] = { + serdeInfo match { + case Some(s) => + s.formatClasses.map { f => + Map("hive.input-format" -> f.input, "hive.output-format" -> f.output) + }.getOrElse(Map.empty) ++ + s.storedAs.map("hive.stored-as" -> _) ++ + s.serde.map("hive.serde" -> _) ++ + s.serdeProperties.map { + case (key, value) => TableCatalog.OPTION_PREFIX + key -> value + } + case None => + Map.empty + } + } + def withDefaultOwnership(properties: Map[String, String]): Map[String, String] = { properties ++ Map(TableCatalog.PROP_OWNER -> Utils.getCurrentUserName()) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index cc3c824befb3e..f650922e75f6e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -63,6 +63,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None) Seq(createSql, replaceSql).foreach { sql => @@ -70,7 +71,7 @@ class DDLParserSuite extends AnalysisTest { } intercept("CREATE TABLE my_tab(a: INT COMMENT 'test', b: STRING) USING parquet", - "no viable alternative at input") + "extraneous input ':'") } test("create/replace table - with IF NOT EXISTS") { @@ -86,6 +87,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None), expectedIfNotExists = true) } @@ -106,6 +108,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None) Seq(createSql, replaceSql).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) @@ -160,6 +163,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None) Seq(createSql, replaceSql).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) @@ -182,6 +186,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None) Seq(createSql, replaceSql).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) @@ -200,7 +205,8 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, - Some("abc")) + Some("abc"), + None) Seq(createSql, replaceSql).foreach{ sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) } @@ -220,6 +226,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None) Seq(createSql, replaceSql).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, 
expectedIfNotExists = false) @@ -238,6 +245,7 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], Some("/tmp/file"), + None, None) Seq(createSql, replaceSql).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) @@ -256,19 +264,309 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], None, + None, None) Seq(createSql, replaceSql).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) } } + test("create/replace table - partition column definitions") { + val createSql = "CREATE TABLE my_tab (id bigint) PARTITIONED BY (part string)" + val replaceSql = "REPLACE TABLE my_tab (id bigint) PARTITIONED BY (part string)" + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + None) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + test("create/replace table - empty columns list") { + val createSql = "CREATE TABLE my_tab PARTITIONED BY (part string)" + val replaceSql = "REPLACE TABLE my_tab PARTITIONED BY (part string)" + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + None) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + test("create/replace table - using with partition column definitions") { + val createSql = "CREATE TABLE my_tab (id bigint) USING parquet PARTITIONED BY (part string)" + val replaceSql = "REPLACE TABLE my_tab (id bigint) USING parquet PARTITIONED BY (part string)" + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + Some("parquet"), + Map.empty[String, String], + None, + None, + None) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + test("create/replace table - mixed partition references and column definitions") { + val createSql = "CREATE TABLE my_tab (id bigint, p1 string) PARTITIONED BY (p1, p2 string)" + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + Seq(createSql, replaceSql).foreach { sql => + assertUnsupported(sql, Seq( + "PARTITION BY: Cannot mix partition expressions and partition columns", + "Expressions: p1", + "Columns: p2 string")) + } + + val createSqlWithExpr = + "CREATE TABLE my_tab (id bigint, p1 string) PARTITIONED BY (p2 string, truncate(p1, 16))" + val replaceSqlWithExpr = createSqlWithExpr.replaceFirst("CREATE", "REPLACE") + Seq(createSqlWithExpr, replaceSqlWithExpr).foreach { sql => + assertUnsupported(sql, Seq( + "PARTITION BY: Cannot mix partition expressions and partition columns", + "Expressions: truncate(p1, 16)", + "Columns: p2 string")) + } + } + + test("create/replace table - stored as") { + val createSql = + """CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS parquet + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") 
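For context on where the SerdeInfo in the expected specs below ends up on the v2 path: CatalogV2Util.convertTableProperties, changed earlier in this patch, flattens it into table properties. The helper below is a hypothetical mirror of that mapping; the key names are taken from convertToProperties in the diff above, everything else is placeholder.

```scala
// Illustrative only; mirrors convertToProperties in CatalogV2Util.
def serdeInfoToProps(
    storedAs: Option[String],
    formatClasses: Option[(String, String)],   // (inputFormat, outputFormat)
    serde: Option[String],
    serdeProperties: Map[String, String]): Map[String, String] = {
  formatClasses.map { case (in, out) =>
    Map("hive.input-format" -> in, "hive.output-format" -> out)
  }.getOrElse(Map.empty) ++
    storedAs.map("hive.stored-as" -> _) ++
    serde.map("hive.serde" -> _) ++
    serdeProperties.map { case (k, v) => "option." + k -> v }
}

// STORED AS rcfile ROW FORMAT SERDE 'customSerde' WITH SERDEPROPERTIES ('prop'='value')
// => Map("hive.stored-as" -> "rcfile", "hive.serde" -> "customSerde", "option.prop" -> "value")
```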
+ val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + Some(SerdeInfo(storedAs = Some("parquet")))) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + test("create/replace table - stored as format with serde") { + Seq("sequencefile", "textfile", "rcfile").foreach { format => + val createSql = + s"""CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS $format + |ROW FORMAT SERDE 'customSerde' + |WITH SERDEPROPERTIES ('prop'='value') + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + Some(SerdeInfo(storedAs = Some(format), serde = Some("customSerde"), serdeProperties = Map( + "prop" -> "value" + )))) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + val createSql = + s"""CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS otherFormat + |ROW FORMAT SERDE 'customSerde' + |WITH SERDEPROPERTIES ('prop'='value') + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + Seq(createSql, replaceSql).foreach { sql => + assertUnsupported(sql, Seq("ROW FORMAT SERDE is incompatible with format 'otherFormat'")) + } + } + + test("create/replace table - stored as format with delimited clauses") { + val createSql = + s"""CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS textfile + |ROW FORMAT DELIMITED + |FIELDS TERMINATED BY ',' ESCAPED BY '\\\\' -- double escape for Scala and for SQL + |COLLECTION ITEMS TERMINATED BY '#' + |MAP KEYS TERMINATED BY '=' + |LINES TERMINATED BY '\\n' + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + Some(SerdeInfo(storedAs = Some("textfile"), serdeProperties = Map( + "field.delim" -> ",", "serialization.format" -> ",", "escape.delim" -> "\\", + "colelction.delim" -> "#", "mapkey.delim" -> "=", "line.delim" -> "\n" + )))) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + + val createFailSql = + s"""CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS otherFormat + |ROW FORMAT DELIMITED + |FIELDS TERMINATED BY ',' + """.stripMargin + val replaceFailSql = createFailSql.replaceFirst("CREATE", "REPLACE") + Seq(createFailSql, replaceFailSql).foreach { sql => + assertUnsupported(sql, Seq( + "ROW FORMAT DELIMITED is only compatible with 'textfile', not 'otherFormat'")) + } + } + + test("create/replace table - stored as inputformat/outputformat") { + val createSql = + """CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS INPUTFORMAT 'inFormat' OUTPUTFORMAT 'outFormat' + """.stripMargin + val replaceSql = 
createSql.replaceFirst("CREATE", "REPLACE") + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + Some(SerdeInfo(formatClasses = Some(FormatClasses("inFormat", "outFormat"))))) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + test("create/replace table - stored as inputformat/outputformat with serde") { + val createSql = + """CREATE TABLE my_tab (id bigint) + |PARTITIONED BY (part string) + |STORED AS INPUTFORMAT 'inFormat' OUTPUTFORMAT 'outFormat' + |ROW FORMAT SERDE 'customSerde' + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + val expectedTableSpec = TableSpec( + Seq("my_tab"), + Some(new StructType().add("id", LongType).add("part", StringType)), + Seq(IdentityTransform(FieldReference("part"))), + None, + Map.empty[String, String], + None, + Map.empty[String, String], + None, + None, + Some(SerdeInfo( + formatClasses = Some(FormatClasses("inFormat", "outFormat")), + serde = Some("customSerde")))) + Seq(createSql, replaceSql).foreach { sql => + testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) + } + } + + test("create/replace table - using with stored as") { + val createSql = + """CREATE TABLE my_tab (id bigint, part string) + |USING parquet + |STORED AS parquet + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + Seq(createSql, replaceSql).foreach { sql => + assertUnsupported(sql, Seq("CREATE TABLE ... USING ... STORED AS")) + } + } + + test("create/replace table - using with row format serde") { + val createSql = + """CREATE TABLE my_tab (id bigint, part string) + |USING parquet + |ROW FORMAT SERDE 'customSerde' + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + Seq(createSql, replaceSql).foreach { sql => + assertUnsupported(sql, Seq("CREATE TABLE ... USING ... ROW FORMAT SERDE")) + } + } + + test("create/replace table - using with row format delimited") { + val createSql = + """CREATE TABLE my_tab (id bigint, part string) + |USING parquet + |ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + Seq(createSql, replaceSql).foreach { sql => + assertUnsupported(sql, Seq("CREATE TABLE ... USING ... ROW FORMAT DELIMITED")) + } + } + + test("create/replace table - stored by") { + val createSql = + """CREATE TABLE my_tab (id bigint, p1 string) + |STORED BY 'handler' + """.stripMargin + val replaceSql = createSql.replaceFirst("CREATE", "REPLACE") + Seq(createSql, replaceSql).foreach { sql => + assertUnsupported(sql, Seq("stored by")) + } + } + + test("Unsupported skew clause - create/replace table") { + intercept("CREATE TABLE my_tab (id bigint) SKEWED BY (id) ON (1,2,3)", + "CREATE TABLE ... SKEWED BY") + intercept("REPLACE TABLE my_tab (id bigint) SKEWED BY (id) ON (1,2,3)", + "CREATE TABLE ... 
SKEWED BY") + } + test("Duplicate clauses - create/replace table") { def createTableHeader(duplicateClause: String): String = { - s"CREATE TABLE my_tab(a INT, b STRING) USING parquet $duplicateClause $duplicateClause" + s"CREATE TABLE my_tab(a INT, b STRING) $duplicateClause $duplicateClause" } def replaceTableHeader(duplicateClause: String): String = { - s"CREATE TABLE my_tab(a INT, b STRING) USING parquet $duplicateClause $duplicateClause" + s"CREATE TABLE my_tab(a INT, b STRING) $duplicateClause $duplicateClause" } intercept(createTableHeader("TBLPROPERTIES('test' = 'test2')"), @@ -281,6 +579,14 @@ class DDLParserSuite extends AnalysisTest { "Found duplicate clauses: CLUSTERED BY") intercept(createTableHeader("PARTITIONED BY (b)"), "Found duplicate clauses: PARTITIONED BY") + intercept(createTableHeader("PARTITIONED BY (c int)"), + "Found duplicate clauses: PARTITIONED BY") + intercept(createTableHeader("STORED AS parquet"), + "Found duplicate clauses: STORED AS") + intercept(createTableHeader("STORED AS INPUTFORMAT 'in' OUTPUTFORMAT 'out'"), + "Found duplicate clauses: STORED AS") + intercept(createTableHeader("ROW FORMAT SERDE 'serde'"), + "Found duplicate clauses: ROW FORMAT") intercept(replaceTableHeader("TBLPROPERTIES('test' = 'test2')"), "Found duplicate clauses: TBLPROPERTIES") @@ -292,6 +598,14 @@ class DDLParserSuite extends AnalysisTest { "Found duplicate clauses: CLUSTERED BY") intercept(replaceTableHeader("PARTITIONED BY (b)"), "Found duplicate clauses: PARTITIONED BY") + intercept(replaceTableHeader("PARTITIONED BY (c int)"), + "Found duplicate clauses: PARTITIONED BY") + intercept(replaceTableHeader("STORED AS parquet"), + "Found duplicate clauses: STORED AS") + intercept(replaceTableHeader("STORED AS INPUTFORMAT 'in' OUTPUTFORMAT 'out'"), + "Found duplicate clauses: STORED AS") + intercept(replaceTableHeader("ROW FORMAT SERDE 'serde'"), + "Found duplicate clauses: ROW FORMAT") } test("support for other types in OPTIONS") { @@ -317,6 +631,7 @@ class DDLParserSuite extends AnalysisTest { Some("json"), Map("a" -> "1", "b" -> "0.1", "c" -> "true"), None, + None, None), expectedIfNotExists = false) } @@ -372,7 +687,8 @@ class DDLParserSuite extends AnalysisTest { Some("parquet"), Map.empty[String, String], Some("/user/external/page_view"), - Some("This is the staging page view table")) + Some("This is the staging page view table"), + None) Seq(s1, s2, s3, s4).foreach { sql => testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = true) } @@ -2105,7 +2421,9 @@ class DDLParserSuite extends AnalysisTest { provider: Option[String], options: Map[String, String], location: Option[String], - comment: Option[String]) + comment: Option[String], + serdeInfo: Option[SerdeInfo], + external: Boolean = false) private object TableSpec { def apply(plan: LogicalPlan): TableSpec = { @@ -2120,7 +2438,9 @@ class DDLParserSuite extends AnalysisTest { create.provider, create.options, create.location, - create.comment) + create.comment, + create.serde, + create.external) case replace: ReplaceTableStatement => TableSpec( replace.tableName, @@ -2131,7 +2451,8 @@ class DDLParserSuite extends AnalysisTest { replace.provider, replace.options, replace.location, - replace.comment) + replace.comment, + replace.serde) case ctas: CreateTableAsSelectStatement => TableSpec( ctas.tableName, @@ -2142,7 +2463,9 @@ class DDLParserSuite extends AnalysisTest { ctas.provider, ctas.options, ctas.location, - ctas.comment) + ctas.comment, + ctas.serde, + ctas.external) case rtas: 
ReplaceTableAsSelectStatement => TableSpec( rtas.tableName, @@ -2153,7 +2476,8 @@ class DDLParserSuite extends AnalysisTest { rtas.provider, rtas.options, rtas.location, - rtas.comment) + rtas.comment, + rtas.serde) case other => fail(s"Expected to parse Create, CTAS, Replace, or RTAS plan" + s" from query, got ${other.getClass.getName}.") @@ -2179,8 +2503,7 @@ class DDLParserSuite extends AnalysisTest { CommentOnTable(UnresolvedTable(Seq("a", "b", "c"), "COMMENT ON TABLE"), "xYz")) } - // TODO: ignored by SPARK-31707, restore the test after create table syntax unification - ignore("create table - without using") { + test("create table - without using") { val sql = "CREATE TABLE 1m.2g(a INT)" val expectedTableSpec = TableSpec( Seq("1m", "2g"), @@ -2191,6 +2514,7 @@ class DDLParserSuite extends AnalysisTest { None, Map.empty[String, String], None, + None, None) testCreateOrReplaceDdl(sql, expectedTableSpec, expectedIfNotExists = false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 31b4c158aa67b..a8688bdf15495 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -658,6 +658,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { extraOptions.get("path"), extraOptions.get(TableCatalog.PROP_COMMENT), extraOptions.toMap, + None, orCreate = true) // Create the table if it doesn't exist case (other, _) => @@ -675,7 +676,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { extraOptions.get("path"), extraOptions.get(TableCatalog.PROP_COMMENT), extraOptions.toMap, - ifNotExists = other == SaveMode.Ignore) + None, + ifNotExists = other == SaveMode.Ignore, + external = false) } runCommand(df.sparkSession, "saveAsTable") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala index d55b5c3103537..9a49fc3d74780 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala @@ -119,7 +119,9 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T]) None, None, options.toMap, - ifNotExists = false) + None, + ifNotExists = false, + external = false) } } @@ -207,6 +209,7 @@ final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T]) None, None, options.toMap, + None, orCreate = orCreate) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 395f5efd5a52d..f49caf7f04a20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 +import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.types.{HIVE_TYPE_STRING, HiveStringType, MetadataBuilder, StructField, StructType} /** @@ -265,16 +266,17 @@ class ResolveSessionCatalog( // For CREATE TABLE [AS SELECT], we should use the v1 
command if the catalog is resolved to the // session catalog and the table provider is not v2. case c @ CreateTableStatement( - SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) - val provider = c.provider.getOrElse(conf.defaultDataSourceName) + val (storageFormat, provider) = getStorageFormatAndProvider( + c.provider, c.options, c.location, c.serde, ctas = false) if (!isV2Provider(provider)) { if (!DDLUtils.isHiveTable(Some(provider))) { assertNoCharTypeInSchema(c.tableSchema) } val tableDesc = buildCatalogTable(tbl.asTableIdentifier, c.tableSchema, - c.partitioning, c.bucketSpec, c.properties, provider, c.options, c.location, - c.comment, c.ifNotExists) + c.partitioning, c.bucketSpec, c.properties, provider, c.location, + c.comment, storageFormat, c.external) val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTable(tableDesc, mode, None) } else { @@ -285,30 +287,32 @@ class ResolveSessionCatalog( c.tableSchema, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), - convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), + convertTableProperties(c), ignoreIfExists = c.ifNotExists) } case c @ CreateTableAsSelectStatement( - SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _, _) => if (c.asSelect.resolved) { assertNoNullTypeInSchema(c.asSelect.schema) } - val provider = c.provider.getOrElse(conf.defaultDataSourceName) + val (storageFormat, provider) = getStorageFormatAndProvider( + c.provider, c.options, c.location, c.serde, ctas = true) if (!isV2Provider(provider)) { val tableDesc = buildCatalogTable(tbl.asTableIdentifier, new StructType, - c.partitioning, c.bucketSpec, c.properties, provider, c.options, c.location, - c.comment, c.ifNotExists) + c.partitioning, c.bucketSpec, c.properties, provider, c.location, + c.comment, storageFormat, c.external) val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTable(tableDesc, mode, Some(c.asSelect)) } else { + assertNoCharTypeInSchema(c.schema) CreateTableAsSelect( catalog.asTableCatalog, tbl.asIdentifier, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), c.asSelect, - convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), + convertTableProperties(c), writeOptions = c.writeOptions, ignoreIfExists = c.ifNotExists) } @@ -322,7 +326,7 @@ class ResolveSessionCatalog( // For REPLACE TABLE [AS SELECT], we should fail if the catalog is resolved to the // session catalog and the table provider is not v2. 
case c @ ReplaceTableStatement( - SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) val provider = c.provider.getOrElse(conf.defaultDataSourceName) if (!isV2Provider(provider)) { @@ -335,12 +339,12 @@ class ResolveSessionCatalog( c.tableSchema, // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), - convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), + convertTableProperties(c), orCreate = c.orCreate) } case c @ ReplaceTableAsSelectStatement( - SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _) => if (c.asSelect.resolved) { assertNoNullTypeInSchema(c.asSelect.schema) } @@ -354,7 +358,7 @@ class ResolveSessionCatalog( // convert the bucket spec and add it as a transform c.partitioning ++ c.bucketSpec.map(_.asTransform), c.asSelect, - convertTableProperties(c.properties, c.options, c.location, c.comment, Some(provider)), + convertTableProperties(c), writeOptions = c.writeOptions, orCreate = c.orCreate) } @@ -621,6 +625,64 @@ class ResolveSessionCatalog( case _ => throw new AnalysisException(s"$sql is only supported with temp views or v1 tables.") } + private def getStorageFormatAndProvider( + provider: Option[String], + options: Map[String, String], + location: Option[String], + maybeSerdeInfo: Option[SerdeInfo], + ctas: Boolean): (CatalogStorageFormat, String) = { + val nonHiveStorageFormat = CatalogStorageFormat.empty.copy( + locationUri = location.map(CatalogUtils.stringToURI), + properties = options) + val defaultHiveStorage = HiveSerDe.getDefaultStorage(conf).copy( + locationUri = location.map(CatalogUtils.stringToURI), + properties = options) + + if (provider.isDefined) { + // The parser guarantees that USING and STORED AS/ROW FORMAT won't co-exist. + if (maybeSerdeInfo.isDefined) { + throw new AnalysisException( + s"Cannot create table with both USING $provider and ${maybeSerdeInfo.get.describe}") + } + (nonHiveStorageFormat, provider.get) + } else if (maybeSerdeInfo.isDefined) { + val serdeInfo = maybeSerdeInfo.get + SerdeInfo.checkSerdePropMerging(serdeInfo.serdeProperties, defaultHiveStorage.properties) + val storageFormat = if (serdeInfo.storedAs.isDefined) { + // If `STORED AS fileFormat` is used, infer inputFormat, outputFormat and serde from it. + HiveSerDe.sourceToSerDe(serdeInfo.storedAs.get) match { + case Some(hiveSerde) => + defaultHiveStorage.copy( + inputFormat = hiveSerde.inputFormat.orElse(defaultHiveStorage.inputFormat), + outputFormat = hiveSerde.outputFormat.orElse(defaultHiveStorage.outputFormat), + // User specified serde takes precedence over the one inferred from file format. 
+ serde = serdeInfo.serde.orElse(hiveSerde.serde).orElse(defaultHiveStorage.serde), + properties = serdeInfo.serdeProperties ++ defaultHiveStorage.properties) + case _ => throw new AnalysisException( + s"STORED AS with file format '${serdeInfo.storedAs.get}' is invalid.") + } + } else { + defaultHiveStorage.copy( + inputFormat = + serdeInfo.formatClasses.map(_.input).orElse(defaultHiveStorage.inputFormat), + outputFormat = + serdeInfo.formatClasses.map(_.output).orElse(defaultHiveStorage.outputFormat), + serde = serdeInfo.serde.orElse(defaultHiveStorage.serde), + properties = serdeInfo.serdeProperties ++ defaultHiveStorage.properties) + } + (storageFormat, DDLUtils.HIVE_PROVIDER) + } else { + // If neither USING nor STORED AS/ROW FORMAT is specified, we create native data source + // tables if it's a CTAS and `conf.convertCTAS` is true. + // TODO: create native data source table by default for non-CTAS. + if (ctas && conf.convertCTAS) { + (nonHiveStorageFormat, conf.defaultDataSourceName) + } else { + (defaultHiveStorage, DDLUtils.HIVE_PROVIDER) + } + } + } + private def buildCatalogTable( table: TableIdentifier, schema: StructType, @@ -628,13 +690,19 @@ class ResolveSessionCatalog( bucketSpec: Option[BucketSpec], properties: Map[String, String], provider: String, - options: Map[String, String], location: Option[String], comment: Option[String], - ifNotExists: Boolean): CatalogTable = { - val storage = CatalogStorageFormat.empty.copy( - locationUri = location.map(CatalogUtils.stringToURI), - properties = options) + storageFormat: CatalogStorageFormat, + external: Boolean): CatalogTable = { + if (external) { + if (DDLUtils.isHiveTable(Some(provider))) { + if (location.isEmpty) { + throw new AnalysisException(s"CREATE EXTERNAL TABLE must be accompanied by LOCATION") + } + } else { + throw new AnalysisException(s"Operation not allowed: CREATE EXTERNAL TABLE ... USING") + } + } val tableType = if (location.isDefined) { CatalogTableType.EXTERNAL @@ -645,7 +713,7 @@ class ResolveSessionCatalog( CatalogTable( identifier = table, tableType = tableType, - storage = storage, + storage = storageFormat, schema = schema, provider = Some(provider), partitionColumnNames = partitioning.asPartitionColumns, @@ -717,6 +785,9 @@ class ResolveSessionCatalog( } private def isV2Provider(provider: String): Boolean = { + // Return earlier since `lookupDataSourceV2` may fail to resolve provider "hive" to + // `HiveFileFormat`, when running tests in sql/core. + if (DDLUtils.isHiveTable(Some(provider))) return false DataSource.lookupDataSourceV2(provider, conf) match { // TODO(SPARK-28396): Currently file source v2 can't work with tables. 
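Stepping back from this hunk: getStorageFormatAndProvider above is what decides both the storage format and the provider under the unified syntax. The sketch below is a condensed, hypothetical summary of that choice; "hive" stands for DDLUtils.HIVE_PROVIDER, "<default>" for conf.defaultDataSourceName, and the real method also builds the CatalogStorageFormat.

```scala
// Condensed sketch of the provider decision in getStorageFormatAndProvider.
def chosenProvider(
    using: Option[String],      // USING <provider>, if present
    hasSerdeClauses: Boolean,   // STORED AS / ROW FORMAT present?
    ctas: Boolean,
    convertCTAS: Boolean): String = (using, hasSerdeClauses) match {
  case (Some(_), true) =>
    // The parser and analyzer reject this combination outright.
    throw new IllegalArgumentException("Cannot combine USING with STORED AS/ROW FORMAT")
  case (Some(p), false) => p                                    // regular data source table
  case (None, true)     => "hive"                               // Hive serde table
  case (None, false)    => if (ctas && convertCTAS) "<default>" else "hive"
}
```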
case Some(_: FileDataSourceV2) => false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 01522257c072d..a92f0775f1c05 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -26,7 +26,6 @@ import scala.collection.JavaConverters._ import org.antlr.v4.runtime.{ParserRuleContext, Token} import org.antlr.v4.runtime.tree.TerminalNode -import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.Expression @@ -37,7 +36,6 @@ import org.apache.spark.sql.catalyst.util.DateTimeConstants import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.internal.{HiveSerDe, SQLConf, VariableSubstitution} -import org.apache.spark.sql.types.StructType /** * Concrete parser for Spark SQL statements. @@ -279,7 +277,7 @@ class SparkSqlAstBuilder extends AstBuilder { operationNotAllowed("CREATE TEMPORARY TABLE IF NOT EXISTS", ctx) } - val (_, _, _, options, location, _) = visitCreateTableClauses(ctx.createTableClauses()) + val (_, _, _, _, options, location, _, _) = visitCreateTableClauses(ctx.createTableClauses()) val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText).getOrElse( throw new ParseException("CREATE TEMPORARY TABLE without a provider is not allowed.", ctx)) val schema = Option(ctx.colTypeList()).map(createSchema) @@ -382,153 +380,34 @@ class SparkSqlAstBuilder extends AstBuilder { } } - /** - * Create a Hive serde table, returning a [[CreateTable]] logical plan. - * - * This is a legacy syntax for Hive compatibility, we recommend users to use the Spark SQL - * CREATE TABLE syntax to create Hive serde table, e.g. "CREATE TABLE ... USING hive ..." - * - * Note: several features are currently not supported - temporary tables, bucketing, - * skewed columns and storage handlers (STORED BY). - * - * Expected format: - * {{{ - * CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name - * [(col1[:] data_type [COMMENT col_comment], ...)] - * create_table_clauses - * [AS select_statement]; - * - * create_table_clauses (order insensitive): - * [COMMENT table_comment] - * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] - * [ROW FORMAT row_format] - * [STORED AS file_format] - * [LOCATION path] - * [TBLPROPERTIES (property_name=property_value, ...)] - * }}} - */ - override def visitCreateHiveTable(ctx: CreateHiveTableContext): LogicalPlan = withOrigin(ctx) { - val (ident, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader) - // TODO: implement temporary tables - if (temp) { - throw new ParseException( - "CREATE TEMPORARY TABLE is not supported yet. " + - "Please use CREATE TEMPORARY VIEW as an alternative.", ctx) - } - if (ctx.skewSpec.size > 0) { - operationNotAllowed("CREATE TABLE ... 
SKEWED BY", ctx) - } - - checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) - checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) - checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) - checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) - checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) - checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) - checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) - - val dataCols = Option(ctx.columns).map(visitColTypeList).getOrElse(Nil) - val partitionCols = Option(ctx.partitionColumns).map(visitColTypeList).getOrElse(Nil) - val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) - val selectQuery = Option(ctx.query).map(plan) - val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) - - // Note: Hive requires partition columns to be distinct from the schema, so we need - // to include the partition columns here explicitly - val schema = StructType(dataCols ++ partitionCols) - - // Storage format - val defaultStorage = HiveSerDe.getDefaultStorage(conf) - validateRowFormatFileFormat( - ctx.rowFormat.asScala.toSeq, ctx.createFileFormat.asScala.toSeq, ctx) - val fileStorage = ctx.createFileFormat.asScala.headOption.map(visitCreateFileFormat) - .getOrElse(CatalogStorageFormat.empty) - val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) - .getOrElse(CatalogStorageFormat.empty) - val location = visitLocationSpecList(ctx.locationSpec()) - // If we are creating an EXTERNAL table, then the LOCATION field is required - if (external && location.isEmpty) { - operationNotAllowed("CREATE EXTERNAL TABLE must be accompanied by LOCATION", ctx) - } - - val locUri = location.map(CatalogUtils.stringToURI(_)) - val storage = CatalogStorageFormat( - locationUri = locUri, - inputFormat = fileStorage.inputFormat.orElse(defaultStorage.inputFormat), - outputFormat = fileStorage.outputFormat.orElse(defaultStorage.outputFormat), - serde = rowStorage.serde.orElse(fileStorage.serde).orElse(defaultStorage.serde), - compressed = false, - properties = rowStorage.properties ++ fileStorage.properties) - // If location is defined, we'll assume this is an external table. - // Otherwise, we may accidentally delete existing data. - val tableType = if (external || location.isDefined) { - CatalogTableType.EXTERNAL + private def toStorageFormat( + location: Option[String], + maybeSerdeInfo: Option[SerdeInfo], + ctx: ParserRuleContext): CatalogStorageFormat = { + if (maybeSerdeInfo.isEmpty) { + CatalogStorageFormat.empty.copy(locationUri = location.map(CatalogUtils.stringToURI)) } else { - CatalogTableType.MANAGED - } - - val name = tableIdentifier(ident, "CREATE TABLE ... STORED AS ...", ctx) - - // TODO support the sql text - have a proper location for this! - val tableDesc = CatalogTable( - identifier = name, - tableType = tableType, - storage = storage, - schema = schema, - bucketSpec = bucketSpec, - provider = Some(DDLUtils.HIVE_PROVIDER), - partitionColumnNames = partitionCols.map(_.name), - properties = properties, - comment = visitCommentSpecList(ctx.commentSpec())) - - val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists - - selectQuery match { - case Some(q) => - // Don't allow explicit specification of schema for CTAS. 
- if (dataCols.nonEmpty) { - operationNotAllowed( - "Schema may not be specified in a Create Table As Select (CTAS) statement", - ctx) - } - - // When creating partitioned table with CTAS statement, we can't specify data type for the - // partition columns. - if (partitionCols.nonEmpty) { - val errorMessage = "Create Partitioned Table As Select cannot specify data type for " + - "the partition columns of the target table." - operationNotAllowed(errorMessage, ctx) - } - - // Hive CTAS supports dynamic partition by specifying partition column names. - val partitionColumnNames = - Option(ctx.partitionColumnNames) - .map(visitIdentifierList(_).toArray) - .getOrElse(Array.empty[String]) - - val tableDescWithPartitionColNames = - tableDesc.copy(partitionColumnNames = partitionColumnNames) - - val hasStorageProperties = (ctx.createFileFormat.size != 0) || (ctx.rowFormat.size != 0) - if (conf.convertCTAS && !hasStorageProperties) { - // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties - // are empty Maps. - val newTableDesc = tableDescWithPartitionColNames.copy( - storage = CatalogStorageFormat.empty.copy(locationUri = locUri), - provider = Some(conf.defaultDataSourceName)) - CreateTable(newTableDesc, mode, Some(q)) - } else { - CreateTable(tableDescWithPartitionColNames, mode, Some(q)) - } - case None => - // When creating partitioned table, we must specify data type for the partition columns. - if (Option(ctx.partitionColumnNames).isDefined) { - val errorMessage = "Must specify a data type for each partition column while creating " + - "Hive partitioned table." - operationNotAllowed(errorMessage, ctx) + val serdeInfo = maybeSerdeInfo.get + if (serdeInfo.storedAs.isEmpty) { + CatalogStorageFormat.empty.copy( + locationUri = location.map(CatalogUtils.stringToURI), + inputFormat = serdeInfo.formatClasses.map(_.input), + outputFormat = serdeInfo.formatClasses.map(_.output), + serde = serdeInfo.serde, + properties = serdeInfo.serdeProperties) + } else { + HiveSerDe.sourceToSerDe(serdeInfo.storedAs.get) match { + case Some(hiveSerde) => + CatalogStorageFormat.empty.copy( + locationUri = location.map(CatalogUtils.stringToURI), + inputFormat = hiveSerde.inputFormat, + outputFormat = hiveSerde.outputFormat, + serde = serdeInfo.serde.orElse(hiveSerde.serde), + properties = serdeInfo.serdeProperties) + case _ => + operationNotAllowed(s"STORED AS with file format '${serdeInfo.storedAs.get}'", ctx) } - - CreateTable(tableDesc, mode, None) + } } } @@ -559,189 +438,27 @@ class SparkSqlAstBuilder extends AstBuilder { checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) val provider = ctx.tableProvider.asScala.headOption.map(_.multipartIdentifier.getText) val location = visitLocationSpecList(ctx.locationSpec()) - // rowStorage used to determine CatalogStorageFormat.serde and - // CatalogStorageFormat.properties in STORED AS clause. 
- val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) - .getOrElse(CatalogStorageFormat.empty) - val fileFormat = ctx.createFileFormat.asScala.headOption.map(visitCreateFileFormat) match { - case Some(f) => - if (provider.isDefined) { - throw new ParseException("'STORED AS hiveFormats' and 'USING provider' " + - "should not be specified both", ctx) - } - f.copy( - locationUri = location.map(CatalogUtils.stringToURI), - serde = rowStorage.serde.orElse(f.serde), - properties = rowStorage.properties ++ f.properties) - case None => - if (rowStorage.serde.isDefined) { - throw new ParseException("'ROW FORMAT' must be used with 'STORED AS'", ctx) - } - CatalogStorageFormat.empty.copy(locationUri = location.map(CatalogUtils.stringToURI)) + // TODO: Do not skip serde check for CREATE TABLE LIKE. + val serdeInfo = getSerdeInfo( + ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx, skipCheck = true) + if (provider.isDefined && serdeInfo.isDefined) { + operationNotAllowed(s"CREATE TABLE LIKE ... USING ... ${serdeInfo.get.describe}", ctx) } - val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) - CreateTableLikeCommand( - targetTable, sourceTable, fileFormat, provider, properties, ctx.EXISTS != null) - } - /** - * Create a [[CatalogStorageFormat]] for creating tables. - * - * Format: STORED AS ... - */ - override def visitCreateFileFormat( - ctx: CreateFileFormatContext): CatalogStorageFormat = withOrigin(ctx) { - (ctx.fileFormat, ctx.storageHandler) match { - // Expected format: INPUTFORMAT input_format OUTPUTFORMAT output_format - case (c: TableFileFormatContext, null) => - visitTableFileFormat(c) - // Expected format: SEQUENCEFILE | TEXTFILE | RCFILE | ORC | PARQUET | AVRO - case (c: GenericFileFormatContext, null) => - visitGenericFileFormat(c) - case (null, storageHandler) => - operationNotAllowed("STORED BY", ctx) - case _ => - throw new ParseException("Expected either STORED AS or STORED BY, not both", ctx) - } - } - - /** - * Create a [[CatalogStorageFormat]]. - */ - override def visitTableFileFormat( - ctx: TableFileFormatContext): CatalogStorageFormat = withOrigin(ctx) { - CatalogStorageFormat.empty.copy( - inputFormat = Option(string(ctx.inFmt)), - outputFormat = Option(string(ctx.outFmt))) - } - - /** - * Resolve a [[HiveSerDe]] based on the name given and return it as a [[CatalogStorageFormat]]. - */ - override def visitGenericFileFormat( - ctx: GenericFileFormatContext): CatalogStorageFormat = withOrigin(ctx) { - val source = ctx.identifier.getText - HiveSerDe.sourceToSerDe(source) match { - case Some(s) => - CatalogStorageFormat.empty.copy( - inputFormat = s.inputFormat, - outputFormat = s.outputFormat, - serde = s.serde) - case None => - operationNotAllowed(s"STORED AS with file format '$source'", ctx) - } - } - - /** - * Create a [[CatalogStorageFormat]] used for creating tables. - * - * Example format: - * {{{ - * SERDE serde_name [WITH SERDEPROPERTIES (k1=v1, k2=v2, ...)] - * }}} - * - * OR - * - * {{{ - * DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] - * [COLLECTION ITEMS TERMINATED BY char] - * [MAP KEYS TERMINATED BY char] - * [LINES TERMINATED BY char] - * [NULL DEFINED AS char] - * }}} - */ - private def visitRowFormat(ctx: RowFormatContext): CatalogStorageFormat = withOrigin(ctx) { - ctx match { - case serde: RowFormatSerdeContext => visitRowFormatSerde(serde) - case delimited: RowFormatDelimitedContext => visitRowFormatDelimited(delimited) - } - } - - /** - * Create SERDE row format name and properties pair. 
- */ - override def visitRowFormatSerde( - ctx: RowFormatSerdeContext): CatalogStorageFormat = withOrigin(ctx) { - import ctx._ - CatalogStorageFormat.empty.copy( - serde = Option(string(name)), - properties = Option(tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)) - } - - /** - * Create a delimited row format properties object. - */ - override def visitRowFormatDelimited( - ctx: RowFormatDelimitedContext): CatalogStorageFormat = withOrigin(ctx) { - // TODO we need proper support for the NULL format. - val entries = - entry("field.delim", ctx.fieldsTerminatedBy) ++ - entry("serialization.format", ctx.fieldsTerminatedBy) ++ - entry("escape.delim", ctx.escapedBy) ++ - // The following typo is inherited from Hive... - entry("colelction.delim", ctx.collectionItemsTerminatedBy) ++ - entry("mapkey.delim", ctx.keysTerminatedBy) ++ - Option(ctx.linesSeparatedBy).toSeq.map { token => - val value = string(token) - validate( - value == "\n", - s"LINES TERMINATED BY only supports newline '\\n' right now: $value", - ctx) - "line.delim" -> value - } - CatalogStorageFormat.empty.copy(properties = entries.toMap) - } - - /** - * Throw a [[ParseException]] if the user specified incompatible SerDes through ROW FORMAT - * and STORED AS. - * - * The following are allowed. Anything else is not: - * ROW FORMAT SERDE ... STORED AS [SEQUENCEFILE | RCFILE | TEXTFILE] - * ROW FORMAT DELIMITED ... STORED AS TEXTFILE - * ROW FORMAT ... STORED AS INPUTFORMAT ... OUTPUTFORMAT ... - */ - private def validateRowFormatFileFormat( - rowFormatCtx: RowFormatContext, - createFileFormatCtx: CreateFileFormatContext, - parentCtx: ParserRuleContext): Unit = { - if (rowFormatCtx == null || createFileFormatCtx == null) { - return - } - (rowFormatCtx, createFileFormatCtx.fileFormat) match { - case (_, ffTable: TableFileFormatContext) => // OK - case (rfSerde: RowFormatSerdeContext, ffGeneric: GenericFileFormatContext) => - ffGeneric.identifier.getText.toLowerCase(Locale.ROOT) match { - case ("sequencefile" | "textfile" | "rcfile") => // OK - case fmt => - operationNotAllowed( - s"ROW FORMAT SERDE is incompatible with format '$fmt', which also specifies a serde", - parentCtx) - } - case (rfDelimited: RowFormatDelimitedContext, ffGeneric: GenericFileFormatContext) => - ffGeneric.identifier.getText.toLowerCase(Locale.ROOT) match { - case "textfile" => // OK - case fmt => operationNotAllowed( - s"ROW FORMAT DELIMITED is only compatible with 'textfile', not '$fmt'", parentCtx) + // TODO: remove this restriction as it seems unnecessary. + serdeInfo match { + case Some(SerdeInfo(storedAs, formatClasses, serde, _)) => + if (storedAs.isEmpty && formatClasses.isEmpty && serde.isDefined) { + throw new ParseException("'ROW FORMAT' must be used with 'STORED AS'", ctx) } case _ => - // should never happen - def str(ctx: ParserRuleContext): String = { - (0 until ctx.getChildCount).map { i => ctx.getChild(i).getText }.mkString(" ") - } - operationNotAllowed( - s"Unexpected combination of ${str(rowFormatCtx)} and ${str(createFileFormatCtx)}", - parentCtx) } - } - private def validateRowFormatFileFormat( - rowFormatCtx: Seq[RowFormatContext], - createFileFormatCtx: Seq[CreateFileFormatContext], - parentCtx: ParserRuleContext): Unit = { - if (rowFormatCtx.size == 1 && createFileFormatCtx.size == 1) { - validateRowFormatFileFormat(rowFormatCtx.head, createFileFormatCtx.head, parentCtx) - } + // TODO: also look at `HiveSerDe.getDefaultStorage`. 
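+ // Descriptive note (editorial, not in the original patch): toStorageFormat, added earlier in this
+ // diff, maps the optional LOCATION and the parsed SerdeInfo into a CatalogStorageFormat. A
+ // STORED AS short name is resolved through HiveSerDe.sourceToSerDe, while an explicit
+ // INPUTFORMAT/OUTPUTFORMAT, SERDE and SERDEPROPERTIES are carried over as given.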
+ val storage = toStorageFormat(location, serdeInfo, ctx) + val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) + CreateTableLikeCommand( + targetTable, sourceTable, storage, provider, properties, ctx.EXISTS != null) } /** @@ -788,7 +505,7 @@ class SparkSqlAstBuilder extends AstBuilder { case c: RowFormatSerdeContext => // Use a serde format. - val CatalogStorageFormat(None, None, None, Some(name), _, props) = visitRowFormatSerde(c) + val SerdeInfo(None, None, Some(name), props) = visitRowFormatSerde(c) // SPARK-10310: Special cases LazySimpleSerDe val recordHandler = if (name == "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") { @@ -896,28 +613,21 @@ class SparkSqlAstBuilder extends AstBuilder { */ override def visitInsertOverwriteHiveDir( ctx: InsertOverwriteHiveDirContext): InsertDirParams = withOrigin(ctx) { - validateRowFormatFileFormat(ctx.rowFormat, ctx.createFileFormat, ctx) - val rowStorage = Option(ctx.rowFormat).map(visitRowFormat) - .getOrElse(CatalogStorageFormat.empty) - val fileStorage = Option(ctx.createFileFormat).map(visitCreateFileFormat) - .getOrElse(CatalogStorageFormat.empty) - + val serdeInfo = getSerdeInfo( + Option(ctx.rowFormat).toSeq, Option(ctx.createFileFormat).toSeq, ctx) val path = string(ctx.path) // The path field is required if (path.isEmpty) { operationNotAllowed("INSERT OVERWRITE DIRECTORY must be accompanied by path", ctx) } - val defaultStorage = HiveSerDe.getDefaultStorage(conf) - - val storage = CatalogStorageFormat( - locationUri = Some(CatalogUtils.stringToURI(path)), - inputFormat = fileStorage.inputFormat.orElse(defaultStorage.inputFormat), - outputFormat = fileStorage.outputFormat.orElse(defaultStorage.outputFormat), - serde = rowStorage.serde.orElse(fileStorage.serde).orElse(defaultStorage.serde), - compressed = false, - properties = rowStorage.properties ++ fileStorage.properties) + val default = HiveSerDe.getDefaultStorage(conf) + val storage = toStorageFormat(Some(path), serdeInfo, ctx) + val finalStorage = storage.copy( + inputFormat = storage.inputFormat.orElse(default.inputFormat), + outputFormat = storage.outputFormat.orElse(default.outputFormat), + serde = storage.serde.orElse(default.serde)) - (ctx.LOCAL != null, storage, Some(DDLUtils.HIVE_PROVIDER)) + (ctx.LOCAL != null, finalStorage, Some(DDLUtils.HIVE_PROVIDER)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index 9ee145580ce6d..f330d6a8c99e2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -85,7 +85,7 @@ class V2SessionCatalog(catalog: SessionCatalog) val provider = properties.getOrDefault(TableCatalog.PROP_PROVIDER, conf.defaultDataSourceName) val tableProperties = properties.asScala val location = Option(properties.get(TableCatalog.PROP_LOCATION)) - val storage = DataSource.buildStorageFormatFromOptions(tableProperties.toMap) + val storage = DataSource.buildStorageFormatFromOptions(toOptions(tableProperties.toMap)) .copy(locationUri = location.map(CatalogUtils.stringToURI)) val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED @@ -111,6 +111,12 @@ class V2SessionCatalog(catalog: SessionCatalog) loadTable(ident) } + private def toOptions(properties: Map[String, String]): Map[String, 
String] = { + properties.filterKeys(_.startsWith(TableCatalog.OPTION_PREFIX)).map { + case (key, value) => key.drop(TableCatalog.OPTION_PREFIX.length) -> value + } + } + override def alterTable( ident: Identifier, changes: TableChange*): Table = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 222fa8ace4dca..f2b57f9442d09 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -432,7 +432,7 @@ class DataSourceV2SQLSuite intercept[Exception] { spark.sql("REPLACE TABLE testcat.table_name" + - s" USING foo OPTIONS (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}`=true)" + + s" USING foo TBLPROPERTIES (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}`=true)" + s" AS SELECT id FROM source") } @@ -465,7 +465,7 @@ class DataSourceV2SQLSuite intercept[Exception] { spark.sql("REPLACE TABLE testcat_atomic.table_name" + - s" USING foo OPTIONS (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}=true)" + + s" USING foo TBLPROPERTIES (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}=true)" + s" AS SELECT id FROM source") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index f55fbc9809f71..61c16baedb7cc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -20,16 +20,14 @@ package org.apache.spark.sql.execution import scala.collection.JavaConverters._ import org.apache.spark.internal.config.ConfigEntry -import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAlias, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, Concat, SortOrder} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.datasources.{CreateTable, CreateTempViewUsing, RefreshResource} -import org.apache.spark.sql.internal.{HiveSerDe, StaticSQLConf} -import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} +import org.apache.spark.sql.execution.datasources.{CreateTempViewUsing, RefreshResource} +import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.types.StringType /** * Parser test cases for rules defined in [[SparkSqlParser]]. @@ -42,23 +40,8 @@ class SparkSqlParserSuite extends AnalysisTest { private lazy val parser = new SparkSqlParser() - /** - * Normalizes plans: - * - CreateTable the createTime in tableDesc will replaced by -1L. 
- */ - override def normalizePlan(plan: LogicalPlan): LogicalPlan = { - plan match { - case CreateTable(tableDesc, mode, query) => - val newTableDesc = tableDesc.copy(createTime = -1L) - CreateTable(newTableDesc, mode, query) - case _ => plan // Don't transform - } - } - private def assertEqual(sqlCommand: String, plan: LogicalPlan): Unit = { - val normalized1 = normalizePlan(parser.parsePlan(sqlCommand)) - val normalized2 = normalizePlan(plan) - comparePlans(normalized1, normalized2) + comparePlans(parser.parsePlan(sqlCommand), plan) } private def intercept(sqlCommand: String, messages: String*): Unit = @@ -210,110 +193,6 @@ class SparkSqlParserSuite extends AnalysisTest { Map("path" -> "/data/tmp/testspark1"))) } - private def createTableUsing( - table: String, - database: Option[String] = None, - tableType: CatalogTableType = CatalogTableType.MANAGED, - storage: CatalogStorageFormat = CatalogStorageFormat.empty, - schema: StructType = new StructType, - provider: Option[String] = Some("parquet"), - partitionColumnNames: Seq[String] = Seq.empty, - bucketSpec: Option[BucketSpec] = None, - mode: SaveMode = SaveMode.ErrorIfExists, - query: Option[LogicalPlan] = None): CreateTable = { - CreateTable( - CatalogTable( - identifier = TableIdentifier(table, database), - tableType = tableType, - storage = storage, - schema = schema, - provider = provider, - partitionColumnNames = partitionColumnNames, - bucketSpec = bucketSpec - ), mode, query - ) - } - - private def createTable( - table: String, - database: Option[String] = None, - tableType: CatalogTableType = CatalogTableType.MANAGED, - storage: CatalogStorageFormat = CatalogStorageFormat.empty.copy( - inputFormat = HiveSerDe.sourceToSerDe("textfile").get.inputFormat, - outputFormat = HiveSerDe.sourceToSerDe("textfile").get.outputFormat, - serde = Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")), - schema: StructType = new StructType, - provider: Option[String] = Some("hive"), - partitionColumnNames: Seq[String] = Seq.empty, - comment: Option[String] = None, - mode: SaveMode = SaveMode.ErrorIfExists, - query: Option[LogicalPlan] = None): CreateTable = { - CreateTable( - CatalogTable( - identifier = TableIdentifier(table, database), - tableType = tableType, - storage = storage, - schema = schema, - provider = provider, - partitionColumnNames = partitionColumnNames, - comment = comment - ), mode, query - ) - } - - test("create table - schema") { - assertEqual("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) STORED AS textfile", - createTable( - table = "my_tab", - schema = (new StructType) - .add("a", IntegerType, nullable = true, "test") - .add("b", StringType) - ) - ) - assertEqual("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) " + - "PARTITIONED BY (c INT, d STRING COMMENT 'test2')", - createTable( - table = "my_tab", - schema = (new StructType) - .add("a", IntegerType, nullable = true, "test") - .add("b", StringType) - .add("c", IntegerType) - .add("d", StringType, nullable = true, "test2"), - partitionColumnNames = Seq("c", "d") - ) - ) - assertEqual("CREATE TABLE my_tab(id BIGINT, nested STRUCT) " + - "STORED AS textfile", - createTable( - table = "my_tab", - schema = (new StructType) - .add("id", LongType) - .add("nested", (new StructType) - .add("col1", StringType) - .add("col2", IntegerType) - ) - ) - ) - // Partitioned by a StructType should be accepted by `SparkSqlParser` but will fail an analyze - // rule in `AnalyzeCreateTable`. 
- assertEqual("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) " + - "PARTITIONED BY (nested STRUCT)", - createTable( - table = "my_tab", - schema = (new StructType) - .add("a", IntegerType, nullable = true, "test") - .add("b", StringType) - .add("nested", (new StructType) - .add("col1", StringType) - .add("col2", IntegerType) - ), - partitionColumnNames = Seq("nested") - ) - ) - intercept("CREATE TABLE my_tab(a: INT COMMENT 'test', b: STRING)", - "no viable alternative at input") - } - test("describe query") { val query = "SELECT * FROM t" assertEqual("DESCRIBE QUERY " + query, DescribeQueryCommand(query, parser.parsePlan(query))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index 8ce4bcbadc223..96f9421e1d988 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -17,14 +17,10 @@ package org.apache.spark.sql.execution.command -import java.net.URI import java.util.Locale -import scala.reflect.{classTag, ClassTag} - -import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAttribute} -import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan @@ -32,10 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.JsonTuple import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.SparkSqlParser -import org.apache.spark.sql.execution.datasources.CreateTable -import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.StructType class DDLParserSuite extends AnalysisTest with SharedSparkSession { private lazy val parser = new SparkSqlParser() @@ -50,159 +43,17 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { } } - private def intercept(sqlCommand: String, messages: String*): Unit = - interceptParseException(parser.parsePlan)(sqlCommand, messages: _*) - - private def parseAs[T: ClassTag](query: String): T = { - parser.parsePlan(query) match { - case t: T => t - case other => - fail(s"Expected to parse ${classTag[T].runtimeClass} from query," + - s"got ${other.getClass.getName}: $query") - } - } - private def compareTransformQuery(sql: String, expected: LogicalPlan): Unit = { val plan = parser.parsePlan(sql).asInstanceOf[ScriptTransformation].copy(ioschema = null) comparePlans(plan, expected, checkAnalysis = false) } - private def extractTableDesc(sql: String): (CatalogTable, Boolean) = { - parser.parsePlan(sql).collect { - case CreateTable(tableDesc, mode, _) => (tableDesc, mode == SaveMode.Ignore) - }.head - } - test("alter database - property values must be set") { assertUnsupported( sql = "ALTER DATABASE my_db SET DBPROPERTIES('key_without_value', 'key_with_value'='x')", containsThesePhrases = Seq("key_without_value")) } - test("create hive table - table file format") { - val allSources = Seq("parquet", "parquetfile", "orc", "orcfile", "avro", "avrofile", - "sequencefile", "rcfile", "textfile") - - allSources.foreach { s => - val query = s"CREATE TABLE my_tab STORED AS $s" 
- val ct = parseAs[CreateTable](query) - val hiveSerde = HiveSerDe.sourceToSerDe(s) - assert(hiveSerde.isDefined) - assert(ct.tableDesc.storage.serde == - hiveSerde.get.serde.orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))) - assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) - assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat) - } - } - - test("create hive table - row format and table file format") { - val createTableStart = "CREATE TABLE my_tab ROW FORMAT" - val fileFormat = s"STORED AS INPUTFORMAT 'inputfmt' OUTPUTFORMAT 'outputfmt'" - val query1 = s"$createTableStart SERDE 'anything' $fileFormat" - val query2 = s"$createTableStart DELIMITED FIELDS TERMINATED BY ' ' $fileFormat" - - // No conflicting serdes here, OK - val parsed1 = parseAs[CreateTable](query1) - assert(parsed1.tableDesc.storage.serde == Some("anything")) - assert(parsed1.tableDesc.storage.inputFormat == Some("inputfmt")) - assert(parsed1.tableDesc.storage.outputFormat == Some("outputfmt")) - - val parsed2 = parseAs[CreateTable](query2) - assert(parsed2.tableDesc.storage.serde == - Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - assert(parsed2.tableDesc.storage.inputFormat == Some("inputfmt")) - assert(parsed2.tableDesc.storage.outputFormat == Some("outputfmt")) - } - - test("create hive table - row format serde and generic file format") { - val allSources = Seq("parquet", "orc", "avro", "sequencefile", "rcfile", "textfile") - val supportedSources = Set("sequencefile", "rcfile", "textfile") - - allSources.foreach { s => - val query = s"CREATE TABLE my_tab ROW FORMAT SERDE 'anything' STORED AS $s" - if (supportedSources.contains(s)) { - val ct = parseAs[CreateTable](query) - val hiveSerde = HiveSerDe.sourceToSerDe(s) - assert(hiveSerde.isDefined) - assert(ct.tableDesc.storage.serde == Some("anything")) - assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) - assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat) - } else { - assertUnsupported(query, Seq("row format serde", "incompatible", s)) - } - } - } - - test("create hive table - row format delimited and generic file format") { - val allSources = Seq("parquet", "orc", "avro", "sequencefile", "rcfile", "textfile") - val supportedSources = Set("textfile") - - allSources.foreach { s => - val query = s"CREATE TABLE my_tab ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS $s" - if (supportedSources.contains(s)) { - val ct = parseAs[CreateTable](query) - val hiveSerde = HiveSerDe.sourceToSerDe(s) - assert(hiveSerde.isDefined) - assert(ct.tableDesc.storage.serde == - hiveSerde.get.serde.orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))) - assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) - assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat) - } else { - assertUnsupported(query, Seq("row format delimited", "only compatible with 'textfile'", s)) - } - } - } - - test("create hive external table - location must be specified") { - assertUnsupported( - sql = "CREATE EXTERNAL TABLE my_tab STORED AS parquet", - containsThesePhrases = Seq("create external table", "location")) - val query = "CREATE EXTERNAL TABLE my_tab STORED AS parquet LOCATION '/something/anything'" - val ct = parseAs[CreateTable](query) - assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) - assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) - } - - test("create hive table - property values must be set") { - 
assertUnsupported( - sql = "CREATE TABLE my_tab STORED AS parquet " + - "TBLPROPERTIES('key_without_value', 'key_with_value'='x')", - containsThesePhrases = Seq("key_without_value")) - assertUnsupported( - sql = "CREATE TABLE my_tab ROW FORMAT SERDE 'serde' " + - "WITH SERDEPROPERTIES('key_without_value', 'key_with_value'='x')", - containsThesePhrases = Seq("key_without_value")) - } - - test("create hive table - location implies external") { - val query = "CREATE TABLE my_tab STORED AS parquet LOCATION '/something/anything'" - val ct = parseAs[CreateTable](query) - assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) - assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) - } - - test("Duplicate clauses - create hive table") { - def createTableHeader(duplicateClause: String): String = { - s"CREATE TABLE my_tab(a INT, b STRING) STORED AS parquet $duplicateClause $duplicateClause" - } - - intercept(createTableHeader("TBLPROPERTIES('test' = 'test2')"), - "Found duplicate clauses: TBLPROPERTIES") - intercept(createTableHeader("LOCATION '/tmp/file'"), - "Found duplicate clauses: LOCATION") - intercept(createTableHeader("COMMENT 'a table'"), - "Found duplicate clauses: COMMENT") - intercept(createTableHeader("CLUSTERED BY(b) INTO 256 BUCKETS"), - "Found duplicate clauses: CLUSTERED BY") - intercept(createTableHeader("PARTITIONED BY (k int)"), - "Found duplicate clauses: PARTITIONED BY") - intercept(createTableHeader("STORED AS parquet"), - "Found duplicate clauses: STORED AS/BY") - intercept( - createTableHeader("ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'"), - "Found duplicate clauses: ROW FORMAT") - } - test("insert overwrite directory") { val v1 = "INSERT OVERWRITE DIRECTORY '/tmp/file' USING parquet SELECT 1 as a" parser.parsePlan(v1) match { @@ -359,180 +210,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(e.contains("Found duplicate keys 'a'")) } - test("Test CTAS #1") { - val s1 = - """ - |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view - |COMMENT 'This is the staging page view table' - |STORED AS RCFILE - |LOCATION '/user/external/page_view' - |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src - """.stripMargin - - val s2 = - """ - |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view - |STORED AS RCFILE - |COMMENT 'This is the staging page view table' - |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |LOCATION '/user/external/page_view' - |AS SELECT * FROM src - """.stripMargin - - val s3 = - """ - |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view - |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |LOCATION '/user/external/page_view' - |STORED AS RCFILE - |COMMENT 'This is the staging page view table' - |AS SELECT * FROM src - """.stripMargin - - checkParsing(s1) - checkParsing(s2) - checkParsing(s3) - - def checkParsing(sql: String): Unit = { - val (desc, exists) = extractTableDesc(sql) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - assert(desc.comment == Some("This is the staging page view table")) - // TODO will be SQLText - assert(desc.viewText.isEmpty) - assert(desc.viewCatalogAndNamespace.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.inputFormat 
== Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) - assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) - assert(desc.storage.serde == - Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) - } - } - - test("Test CTAS #2") { - val s1 = - """ - |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view - |COMMENT 'This is the staging page view table' - |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' - | STORED AS - | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' - | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' - |LOCATION '/user/external/page_view' - |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src - """.stripMargin - - val s2 = - """ - |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view - |LOCATION '/user/external/page_view' - |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' - | STORED AS - | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' - | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' - |COMMENT 'This is the staging page view table' - |AS SELECT * FROM src - """.stripMargin - - checkParsing(s1) - checkParsing(s2) - - def checkParsing(sql: String): Unit = { - val (desc, exists) = extractTableDesc(sql) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - // TODO will be SQLText - assert(desc.comment == Some("This is the staging page view table")) - assert(desc.viewText.isEmpty) - assert(desc.viewCatalogAndNamespace.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.properties == Map()) - assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) - assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) - assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) - } - } - - test("Test CTAS #3") { - val s3 = """CREATE TABLE page_view AS SELECT * FROM src""" - val (desc, exists) = extractTableDesc(s3) - assert(exists == false) - assert(desc.identifier.database == None) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.MANAGED) - assert(desc.storage.locationUri == None) - assert(desc.schema.isEmpty) - assert(desc.viewText == None) // TODO will be SQLText - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.storage.properties == Map()) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.TextInputFormat")) - assert(desc.storage.outputFormat == - Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - assert(desc.properties == Map()) - } - - test("Test CTAS #4") { - val s4 = - """CREATE TABLE page_view - |STORED BY 'storage.handler.class.name' AS SELECT * FROM src""".stripMargin - intercept[AnalysisException] { - extractTableDesc(s4) - } - } - - test("Test CTAS #5") { - val s5 = """CREATE TABLE ctas2 - | ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe" - | WITH 
SERDEPROPERTIES("serde_p1"="p1","serde_p2"="p2") - | STORED AS RCFile - | TBLPROPERTIES("tbl_p1"="p11", "tbl_p2"="p22") - | AS - | SELECT key, value - | FROM src - | ORDER BY key, value""".stripMargin - val (desc, exists) = extractTableDesc(s5) - assert(exists == false) - assert(desc.identifier.database == None) - assert(desc.identifier.table == "ctas2") - assert(desc.tableType == CatalogTableType.MANAGED) - assert(desc.storage.locationUri == None) - assert(desc.schema.isEmpty) - assert(desc.viewText == None) // TODO will be SQLText - assert(desc.viewCatalogAndNamespace.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.storage.properties == Map(("serde_p1" -> "p1"), ("serde_p2" -> "p2"))) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) - assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe")) - assert(desc.properties == Map(("tbl_p1" -> "p11"), ("tbl_p2" -> "p22"))) - } - - test("CTAS statement with a PARTITIONED BY clause is not allowed") { - assertUnsupported(s"CREATE TABLE ctas1 PARTITIONED BY (k int)" + - " AS SELECT key, value FROM (SELECT 1 as key, 2 as value) tmp") - } - - test("CTAS statement with schema") { - assertUnsupported(s"CREATE TABLE ctas1 (age INT, name STRING) AS SELECT * FROM src") - assertUnsupported(s"CREATE TABLE ctas1 (age INT, name STRING) AS SELECT 1, 'hello'") - } - test("unsupported operations") { intercept[ParseException] { parser.parsePlan( @@ -642,205 +319,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { """.stripMargin) } - test("create table - basic") { - val query = "CREATE TABLE my_table (id int, name string)" - val (desc, allowExisting) = extractTableDesc(query) - assert(!allowExisting) - assert(desc.identifier.database.isEmpty) - assert(desc.identifier.table == "my_table") - assert(desc.tableType == CatalogTableType.MANAGED) - assert(desc.schema == new StructType().add("id", "int").add("name", "string")) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.bucketSpec.isEmpty) - assert(desc.viewText.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.storage.locationUri.isEmpty) - assert(desc.storage.inputFormat == - Some("org.apache.hadoop.mapred.TextInputFormat")) - assert(desc.storage.outputFormat == - Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - assert(desc.storage.properties.isEmpty) - assert(desc.properties.isEmpty) - assert(desc.comment.isEmpty) - } - - test("create table - with database name") { - val query = "CREATE TABLE dbx.my_table (id int, name string)" - val (desc, _) = extractTableDesc(query) - assert(desc.identifier.database == Some("dbx")) - assert(desc.identifier.table == "my_table") - } - - test("create table - temporary") { - val query = "CREATE TEMPORARY TABLE tab1 (id int, name string)" - val e = intercept[ParseException] { parser.parsePlan(query) } - assert(e.message.contains("CREATE TEMPORARY TABLE is not supported yet")) - } - - test("create table - external") { - val query = "CREATE EXTERNAL TABLE tab1 (id int, name string) LOCATION '/path/to/nowhere'" - val (desc, _) = extractTableDesc(query) - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/path/to/nowhere"))) - } - - test("create table - if not exists") { - val query = 
"CREATE TABLE IF NOT EXISTS tab1 (id int, name string)" - val (_, allowExisting) = extractTableDesc(query) - assert(allowExisting) - } - - test("create table - comment") { - val query = "CREATE TABLE my_table (id int, name string) COMMENT 'its hot as hell below'" - val (desc, _) = extractTableDesc(query) - assert(desc.comment == Some("its hot as hell below")) - } - - test("create table - partitioned columns") { - val query = "CREATE TABLE my_table (id int, name string) PARTITIONED BY (month int)" - val (desc, _) = extractTableDesc(query) - assert(desc.schema == new StructType() - .add("id", "int") - .add("name", "string") - .add("month", "int")) - assert(desc.partitionColumnNames == Seq("month")) - } - - test("create table - clustered by") { - val numBuckets = 10 - val bucketedColumn = "id" - val sortColumn = "id" - val baseQuery = - s""" - CREATE TABLE my_table ( - $bucketedColumn int, - name string) - CLUSTERED BY($bucketedColumn) - """ - - val query1 = s"$baseQuery INTO $numBuckets BUCKETS" - val (desc1, _) = extractTableDesc(query1) - assert(desc1.bucketSpec.isDefined) - val bucketSpec1 = desc1.bucketSpec.get - assert(bucketSpec1.numBuckets == numBuckets) - assert(bucketSpec1.bucketColumnNames.head.equals(bucketedColumn)) - assert(bucketSpec1.sortColumnNames.isEmpty) - - val query2 = s"$baseQuery SORTED BY($sortColumn) INTO $numBuckets BUCKETS" - val (desc2, _) = extractTableDesc(query2) - assert(desc2.bucketSpec.isDefined) - val bucketSpec2 = desc2.bucketSpec.get - assert(bucketSpec2.numBuckets == numBuckets) - assert(bucketSpec2.bucketColumnNames.head.equals(bucketedColumn)) - assert(bucketSpec2.sortColumnNames.head.equals(sortColumn)) - } - - test("create table(hive) - skewed by") { - val baseQuery = "CREATE TABLE my_table (id int, name string) SKEWED BY" - val query1 = s"$baseQuery(id) ON (1, 10, 100)" - val query2 = s"$baseQuery(id, name) ON ((1, 'x'), (2, 'y'), (3, 'z'))" - val query3 = s"$baseQuery(id, name) ON ((1, 'x'), (2, 'y'), (3, 'z')) STORED AS DIRECTORIES" - val e1 = intercept[ParseException] { parser.parsePlan(query1) } - val e2 = intercept[ParseException] { parser.parsePlan(query2) } - val e3 = intercept[ParseException] { parser.parsePlan(query3) } - assert(e1.getMessage.contains("Operation not allowed")) - assert(e2.getMessage.contains("Operation not allowed")) - assert(e3.getMessage.contains("Operation not allowed")) - } - - test("create table(hive) - row format") { - val baseQuery = "CREATE TABLE my_table (id int, name string) ROW FORMAT" - val query1 = s"$baseQuery SERDE 'org.apache.poof.serde.Baff'" - val query2 = s"$baseQuery SERDE 'org.apache.poof.serde.Baff' WITH SERDEPROPERTIES ('k1'='v1')" - val query3 = - s""" - |$baseQuery DELIMITED FIELDS TERMINATED BY 'x' ESCAPED BY 'y' - |COLLECTION ITEMS TERMINATED BY 'a' - |MAP KEYS TERMINATED BY 'b' - |LINES TERMINATED BY '\n' - |NULL DEFINED AS 'c' - """.stripMargin - val (desc1, _) = extractTableDesc(query1) - val (desc2, _) = extractTableDesc(query2) - val (desc3, _) = extractTableDesc(query3) - assert(desc1.storage.serde == Some("org.apache.poof.serde.Baff")) - assert(desc1.storage.properties.isEmpty) - assert(desc2.storage.serde == Some("org.apache.poof.serde.Baff")) - assert(desc2.storage.properties == Map("k1" -> "v1")) - assert(desc3.storage.properties == Map( - "field.delim" -> "x", - "escape.delim" -> "y", - "serialization.format" -> "x", - "line.delim" -> "\n", - "colelction.delim" -> "a", // yes, it's a typo from Hive :) - "mapkey.delim" -> "b")) - } - - test("create table(hive) - file format") { - val 
baseQuery = "CREATE TABLE my_table (id int, name string) STORED AS" - val query1 = s"$baseQuery INPUTFORMAT 'winput' OUTPUTFORMAT 'wowput'" - val query2 = s"$baseQuery ORC" - val (desc1, _) = extractTableDesc(query1) - val (desc2, _) = extractTableDesc(query2) - assert(desc1.storage.inputFormat == Some("winput")) - assert(desc1.storage.outputFormat == Some("wowput")) - assert(desc1.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - assert(desc2.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) - assert(desc2.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) - assert(desc2.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) - } - - test("create table(hive) - storage handler") { - val baseQuery = "CREATE TABLE my_table (id int, name string) STORED BY" - val query1 = s"$baseQuery 'org.papachi.StorageHandler'" - val query2 = s"$baseQuery 'org.mamachi.StorageHandler' WITH SERDEPROPERTIES ('k1'='v1')" - val e1 = intercept[ParseException] { parser.parsePlan(query1) } - val e2 = intercept[ParseException] { parser.parsePlan(query2) } - assert(e1.getMessage.contains("Operation not allowed")) - assert(e2.getMessage.contains("Operation not allowed")) - } - - test("create table - properties") { - val query = "CREATE TABLE my_table (id int, name string) TBLPROPERTIES ('k1'='v1', 'k2'='v2')" - val (desc, _) = extractTableDesc(query) - assert(desc.properties == Map("k1" -> "v1", "k2" -> "v2")) - } - - test("create table(hive) - everything!") { - val query = - """ - |CREATE EXTERNAL TABLE IF NOT EXISTS dbx.my_table (id int, name string) - |COMMENT 'no comment' - |PARTITIONED BY (month int) - |ROW FORMAT SERDE 'org.apache.poof.serde.Baff' WITH SERDEPROPERTIES ('k1'='v1') - |STORED AS INPUTFORMAT 'winput' OUTPUTFORMAT 'wowput' - |LOCATION '/path/to/mercury' - |TBLPROPERTIES ('k1'='v1', 'k2'='v2') - """.stripMargin - val (desc, allowExisting) = extractTableDesc(query) - assert(allowExisting) - assert(desc.identifier.database == Some("dbx")) - assert(desc.identifier.table == "my_table") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.schema == new StructType() - .add("id", "int") - .add("name", "string") - .add("month", "int")) - assert(desc.partitionColumnNames == Seq("month")) - assert(desc.bucketSpec.isEmpty) - assert(desc.viewText.isEmpty) - assert(desc.viewCatalogAndNamespace.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.storage.locationUri == Some(new URI("/path/to/mercury"))) - assert(desc.storage.inputFormat == Some("winput")) - assert(desc.storage.outputFormat == Some("wowput")) - assert(desc.storage.serde == Some("org.apache.poof.serde.Baff")) - assert(desc.storage.properties == Map("k1" -> "v1")) - assert(desc.properties == Map("k1" -> "v1", "k2" -> "v2")) - assert(desc.comment == Some("no comment")) - } - test("create table like") { val v1 = "CREATE TABLE table1 LIKE table2" val (target, source, fileFormat, provider, properties, exists) = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index fd1978c5137a5..92c114e116d0c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -29,14 +29,14 @@ import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} import 
org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveInlineTables, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedSubqueryColumnAliases, UnresolvedV2Relation} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, StringLiteral} -import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, Assignment, CreateTableAsSelect, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, InsertIntoStatement, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, ShowTableProperties, SubqueryAlias, UpdateAction, UpdateTable} +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} +import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, Assignment, CreateTableAsSelect, CreateTableStatement, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, InsertIntoStatement, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, ShowTableProperties, SubqueryAlias, UpdateAction, UpdateTable} import org.apache.spark.sql.connector.FakeV2Provider import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, TableCapability, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.catalog.TableChange.{UpdateColumnComment, UpdateColumnType} import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.sources.SimpleScanSource import org.apache.spark.sql.types.{CharType, DoubleType, HIVE_TYPE_STRING, IntegerType, LongType, MetadataBuilder, StringType, StructField, StructType} @@ -178,6 +178,16 @@ class PlanResolutionSuite extends AnalysisTest { }.head } + private def assertUnsupported(sql: String, containsThesePhrases: Seq[String] = Seq()): Unit = { + val e = intercept[ParseException] { + parsePlan(sql) + } + assert(e.getMessage.toLowerCase(Locale.ROOT).contains("operation not allowed")) + containsThesePhrases.foreach { p => + assert(e.getMessage.toLowerCase(Locale.ROOT).contains(p.toLowerCase(Locale.ROOT))) + } + } + test("create table - with partitioned by") { val query = "CREATE TABLE my_tab(a INT comment 'test', b STRING) " + "USING parquet PARTITIONED BY (a)" @@ -428,10 +438,11 @@ class PlanResolutionSuite extends AnalysisTest { val expectedProperties = Map( "p1" -> "v1", "p2" -> "v2", - "other" -> "20", + "option.other" -> "20", "provider" -> "parquet", "location" -> "s3://bucket/path/to/data", - "comment" -> "table comment") + "comment" -> "table comment", + "other" -> "20") parseAndResolve(sql) match { case create: CreateV2Table => @@ -467,10 +478,11 @@ class PlanResolutionSuite extends AnalysisTest { val expectedProperties = Map( "p1" -> "v1", "p2" -> "v2", - "other" -> "20", + "option.other" -> "20", "provider" -> "parquet", "location" -> "s3://bucket/path/to/data", - "comment" -> "table comment") + "comment" -> "table comment", + "other" -> "20") parseAndResolve(sql, withDefault = true) match { case 
create: CreateV2Table => @@ -542,10 +554,11 @@ class PlanResolutionSuite extends AnalysisTest { val expectedProperties = Map( "p1" -> "v1", "p2" -> "v2", - "other" -> "20", + "option.other" -> "20", "provider" -> "parquet", "location" -> "s3://bucket/path/to/data", - "comment" -> "table comment") + "comment" -> "table comment", + "other" -> "20") parseAndResolve(sql) match { case ctas: CreateTableAsSelect => @@ -576,10 +589,11 @@ class PlanResolutionSuite extends AnalysisTest { val expectedProperties = Map( "p1" -> "v1", "p2" -> "v2", - "other" -> "20", + "option.other" -> "20", "provider" -> "parquet", "location" -> "s3://bucket/path/to/data", - "comment" -> "table comment") + "comment" -> "table comment", + "other" -> "20") parseAndResolve(sql, withDefault = true) match { case ctas: CreateTableAsSelect => @@ -1557,6 +1571,630 @@ class PlanResolutionSuite extends AnalysisTest { checkFailure("testcat.tab", "foo") } + private def compareNormalized(plan1: LogicalPlan, plan2: LogicalPlan): Unit = { + /** + * Normalizes plans: + * - CreateTable the createTime in tableDesc will replaced by -1L. + */ + def normalizePlan(plan: LogicalPlan): LogicalPlan = { + plan match { + case CreateTable(tableDesc, mode, query) => + val newTableDesc = tableDesc.copy(createTime = -1L) + CreateTable(newTableDesc, mode, query) + case _ => plan // Don't transform + } + } + comparePlans(normalizePlan(plan1), normalizePlan(plan2)) + } + + test("create table - schema") { + def createTable( + table: String, + database: Option[String] = None, + tableType: CatalogTableType = CatalogTableType.MANAGED, + storage: CatalogStorageFormat = CatalogStorageFormat.empty.copy( + inputFormat = HiveSerDe.sourceToSerDe("textfile").get.inputFormat, + outputFormat = HiveSerDe.sourceToSerDe("textfile").get.outputFormat, + serde = Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")), + schema: StructType = new StructType, + provider: Option[String] = Some("hive"), + partitionColumnNames: Seq[String] = Seq.empty, + comment: Option[String] = None, + mode: SaveMode = SaveMode.ErrorIfExists, + query: Option[LogicalPlan] = None): CreateTable = { + CreateTable( + CatalogTable( + identifier = TableIdentifier(table, database), + tableType = tableType, + storage = storage, + schema = schema, + provider = provider, + partitionColumnNames = partitionColumnNames, + comment = comment + ), mode, query + ) + } + + def compare(sql: String, plan: LogicalPlan): Unit = { + compareNormalized(parseAndResolve(sql), plan) + } + + compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) STORED AS textfile", + createTable( + table = "my_tab", + database = Some("default"), + schema = (new StructType) + .add("a", IntegerType, nullable = true, "test") + .add("b", StringType) + ) + ) + compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) " + + "PARTITIONED BY (c INT, d STRING COMMENT 'test2')", + createTable( + table = "my_tab", + database = Some("default"), + schema = (new StructType) + .add("a", IntegerType, nullable = true, "test") + .add("b", StringType) + .add("c", IntegerType) + .add("d", StringType, nullable = true, "test2"), + partitionColumnNames = Seq("c", "d") + ) + ) + compare("CREATE TABLE my_tab(id BIGINT, nested STRUCT) " + + "STORED AS textfile", + createTable( + table = "my_tab", + database = Some("default"), + schema = (new StructType) + .add("id", LongType) + .add("nested", (new StructType) + .add("col1", StringType) + .add("col2", IntegerType) + ) + ) + ) + // Partitioned by a StructType should be accepted by `SparkSqlParser` 
but will fail an analyze + // rule in `AnalyzeCreateTable`. + compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) " + + "PARTITIONED BY (nested STRUCT)", + createTable( + table = "my_tab", + database = Some("default"), + schema = (new StructType) + .add("a", IntegerType, nullable = true, "test") + .add("b", StringType) + .add("nested", (new StructType) + .add("col1", StringType) + .add("col2", IntegerType) + ), + partitionColumnNames = Seq("nested") + ) + ) + + interceptParseException(parsePlan)( + "CREATE TABLE my_tab(a: INT COMMENT 'test', b: STRING)", + "extraneous input ':'") + } + + test("create hive table - table file format") { + val allSources = Seq("parquet", "parquetfile", "orc", "orcfile", "avro", "avrofile", + "sequencefile", "rcfile", "textfile") + + allSources.foreach { s => + val query = s"CREATE TABLE my_tab STORED AS $s" + parseAndResolve(query) match { + case ct: CreateTable => + val hiveSerde = HiveSerDe.sourceToSerDe(s) + assert(hiveSerde.isDefined) + assert(ct.tableDesc.storage.serde == + hiveSerde.get.serde.orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))) + assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) + assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat) + } + } + } + + test("create hive table - row format and table file format") { + val createTableStart = "CREATE TABLE my_tab ROW FORMAT" + val fileFormat = s"STORED AS INPUTFORMAT 'inputfmt' OUTPUTFORMAT 'outputfmt'" + val query1 = s"$createTableStart SERDE 'anything' $fileFormat" + val query2 = s"$createTableStart DELIMITED FIELDS TERMINATED BY ' ' $fileFormat" + + // No conflicting serdes here, OK + parseAndResolve(query1) match { + case parsed1: CreateTable => + assert(parsed1.tableDesc.storage.serde == Some("anything")) + assert(parsed1.tableDesc.storage.inputFormat == Some("inputfmt")) + assert(parsed1.tableDesc.storage.outputFormat == Some("outputfmt")) + } + + parseAndResolve(query2) match { + case parsed2: CreateTable => + assert(parsed2.tableDesc.storage.serde == + Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(parsed2.tableDesc.storage.inputFormat == Some("inputfmt")) + assert(parsed2.tableDesc.storage.outputFormat == Some("outputfmt")) + } + } + + test("create hive table - row format serde and generic file format") { + val allSources = Seq("parquet", "orc", "avro", "sequencefile", "rcfile", "textfile") + val supportedSources = Set("sequencefile", "rcfile", "textfile") + + allSources.foreach { s => + val query = s"CREATE TABLE my_tab ROW FORMAT SERDE 'anything' STORED AS $s" + if (supportedSources.contains(s)) { + parseAndResolve(query) match { + case ct: CreateTable => + val hiveSerde = HiveSerDe.sourceToSerDe(s) + assert(hiveSerde.isDefined) + assert(ct.tableDesc.storage.serde == Some("anything")) + assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) + assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat) + } + } else { + assertUnsupported(query, Seq("row format serde", "incompatible", s)) + } + } + } + + test("create hive table - row format delimited and generic file format") { + val allSources = Seq("parquet", "orc", "avro", "sequencefile", "rcfile", "textfile") + val supportedSources = Set("textfile") + + allSources.foreach { s => + val query = s"CREATE TABLE my_tab ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS $s" + if (supportedSources.contains(s)) { + parseAndResolve(query) match { + case ct: CreateTable => + val hiveSerde = HiveSerDe.sourceToSerDe(s) + 
assert(hiveSerde.isDefined) + assert(ct.tableDesc.storage.serde == hiveSerde.get.serde + .orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))) + assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) + assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat) + } + } else { + assertUnsupported(query, Seq("row format delimited", "only compatible with 'textfile'", s)) + } + } + } + + test("create hive external table - location must be specified") { + val exc = intercept[AnalysisException] { + parseAndResolve("CREATE EXTERNAL TABLE my_tab STORED AS parquet") + } + assert(exc.getMessage.contains("CREATE EXTERNAL TABLE must be accompanied by LOCATION")) + + val query = "CREATE EXTERNAL TABLE my_tab STORED AS parquet LOCATION '/something/anything'" + parseAndResolve(query) match { + case ct: CreateTable => + assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) + assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) + } + } + + test("create hive table - property values must be set") { + assertUnsupported( + sql = "CREATE TABLE my_tab STORED AS parquet " + + "TBLPROPERTIES('key_without_value', 'key_with_value'='x')", + containsThesePhrases = Seq("key_without_value")) + assertUnsupported( + sql = "CREATE TABLE my_tab ROW FORMAT SERDE 'serde' " + + "WITH SERDEPROPERTIES('key_without_value', 'key_with_value'='x')", + containsThesePhrases = Seq("key_without_value")) + } + + test("create hive table - location implies external") { + val query = "CREATE TABLE my_tab STORED AS parquet LOCATION '/something/anything'" + parseAndResolve(query) match { + case ct: CreateTable => + assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) + assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) + } + } + + test("Duplicate clauses - create hive table") { + def intercept(sqlCommand: String, messages: String*): Unit = + interceptParseException(parsePlan)(sqlCommand, messages: _*) + + def createTableHeader(duplicateClause: String): String = { + s"CREATE TABLE my_tab(a INT, b STRING) STORED AS parquet $duplicateClause $duplicateClause" + } + + intercept(createTableHeader("TBLPROPERTIES('test' = 'test2')"), + "Found duplicate clauses: TBLPROPERTIES") + intercept(createTableHeader("LOCATION '/tmp/file'"), + "Found duplicate clauses: LOCATION") + intercept(createTableHeader("COMMENT 'a table'"), + "Found duplicate clauses: COMMENT") + intercept(createTableHeader("CLUSTERED BY(b) INTO 256 BUCKETS"), + "Found duplicate clauses: CLUSTERED BY") + intercept(createTableHeader("PARTITIONED BY (k int)"), + "Found duplicate clauses: PARTITIONED BY") + intercept(createTableHeader("STORED AS parquet"), + "Found duplicate clauses: STORED AS/BY") + intercept( + createTableHeader("ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'"), + "Found duplicate clauses: ROW FORMAT") + } + + test("Test CTAS #1") { + val s1 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |COMMENT 'This is the staging page view table' + |STORED AS RCFILE + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |LOCATION '/user/external/page_view' + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |TBLPROPERTIES ('p1'='v1', 
'p2'='v2') + |LOCATION '/user/external/page_view' + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + // TODO will be SQLText + assert(desc.viewText.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) + assert(desc.storage.serde == + Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } + } + + test("Test CTAS #2") { + val s1 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |COMMENT 'This is the staging page view table' + |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' + | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' + | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + // TODO will be SQLText + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.properties == Map()) + assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) + assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) + assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } + } + + test("Test CTAS #3") { + val s3 = """CREATE TABLE page_view AS SELECT * FROM src""" + val (desc, exists) = extractTableDesc(s3) + assert(exists == false) + assert(desc.identifier.database == Some("default")) + assert(desc.identifier.table == "page_view") + 
assert(desc.tableType == CatalogTableType.MANAGED) + assert(desc.storage.locationUri == None) + assert(desc.schema.isEmpty) + assert(desc.viewText == None) // TODO will be SQLText + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.storage.properties == Map()) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.TextInputFormat")) + assert(desc.storage.outputFormat == + Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(desc.properties == Map()) + } + + test("Test CTAS #4") { + val s4 = + """CREATE TABLE page_view + |STORED BY 'storage.handler.class.name' AS SELECT * FROM src""".stripMargin + intercept[AnalysisException] { + extractTableDesc(s4) + } + } + + test("Test CTAS #5") { + val s5 = """CREATE TABLE ctas2 + | ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe" + | WITH SERDEPROPERTIES("serde_p1"="p1","serde_p2"="p2") + | STORED AS RCFile + | TBLPROPERTIES("tbl_p1"="p11", "tbl_p2"="p22") + | AS + | SELECT key, value + | FROM src + | ORDER BY key, value""".stripMargin + val (desc, exists) = extractTableDesc(s5) + assert(exists == false) + assert(desc.identifier.database == Some("default")) + assert(desc.identifier.table == "ctas2") + assert(desc.tableType == CatalogTableType.MANAGED) + assert(desc.storage.locationUri == None) + assert(desc.schema.isEmpty) + assert(desc.viewText == None) // TODO will be SQLText + assert(desc.viewCatalogAndNamespace.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.storage.properties == Map(("serde_p1" -> "p1"), ("serde_p2" -> "p2"))) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe")) + assert(desc.properties == Map(("tbl_p1" -> "p11"), ("tbl_p2" -> "p22"))) + } + + test("CTAS statement with a PARTITIONED BY clause is not allowed") { + assertUnsupported(s"CREATE TABLE ctas1 PARTITIONED BY (k int)" + + " AS SELECT key, value FROM (SELECT 1 as key, 2 as value) tmp") + } + + test("CTAS statement with schema") { + assertUnsupported(s"CREATE TABLE ctas1 (age INT, name STRING) AS SELECT * FROM src") + assertUnsupported(s"CREATE TABLE ctas1 (age INT, name STRING) AS SELECT 1, 'hello'") + } + + test("create table - basic") { + val query = "CREATE TABLE my_table (id int, name string)" + val (desc, allowExisting) = extractTableDesc(query) + assert(!allowExisting) + assert(desc.identifier.database == Some("default")) + assert(desc.identifier.table == "my_table") + assert(desc.tableType == CatalogTableType.MANAGED) + assert(desc.schema == new StructType().add("id", "int").add("name", "string")) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.bucketSpec.isEmpty) + assert(desc.viewText.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.storage.locationUri.isEmpty) + assert(desc.storage.inputFormat == + Some("org.apache.hadoop.mapred.TextInputFormat")) + assert(desc.storage.outputFormat == + Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) + assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(desc.storage.properties.isEmpty) + assert(desc.properties.isEmpty) + assert(desc.comment.isEmpty) + } + + test("create table - with database name") { + val query = "CREATE TABLE dbx.my_table 
(id int, name string)" + val (desc, _) = extractTableDesc(query) + assert(desc.identifier.database == Some("dbx")) + assert(desc.identifier.table == "my_table") + } + + test("create table - temporary") { + val query = "CREATE TEMPORARY TABLE tab1 (id int, name string)" + val e = intercept[ParseException] { parsePlan(query) } + assert(e.message.contains("Operation not allowed: CREATE TEMPORARY TABLE")) + } + + test("create table - external") { + val query = "CREATE EXTERNAL TABLE tab1 (id int, name string) LOCATION '/path/to/nowhere'" + val (desc, _) = extractTableDesc(query) + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/path/to/nowhere"))) + } + + test("create table - if not exists") { + val query = "CREATE TABLE IF NOT EXISTS tab1 (id int, name string)" + val (_, allowExisting) = extractTableDesc(query) + assert(allowExisting) + } + + test("create table - comment") { + val query = "CREATE TABLE my_table (id int, name string) COMMENT 'its hot as hell below'" + val (desc, _) = extractTableDesc(query) + assert(desc.comment == Some("its hot as hell below")) + } + + test("create table - partitioned columns") { + val query = "CREATE TABLE my_table (id int, name string) PARTITIONED BY (month int)" + val (desc, _) = extractTableDesc(query) + assert(desc.schema == new StructType() + .add("id", "int") + .add("name", "string") + .add("month", "int")) + assert(desc.partitionColumnNames == Seq("month")) + } + + test("create table - clustered by") { + val numBuckets = 10 + val bucketedColumn = "id" + val sortColumn = "id" + val baseQuery = + s""" + CREATE TABLE my_table ( + $bucketedColumn int, + name string) + CLUSTERED BY($bucketedColumn) + """ + + val query1 = s"$baseQuery INTO $numBuckets BUCKETS" + val (desc1, _) = extractTableDesc(query1) + assert(desc1.bucketSpec.isDefined) + val bucketSpec1 = desc1.bucketSpec.get + assert(bucketSpec1.numBuckets == numBuckets) + assert(bucketSpec1.bucketColumnNames.head.equals(bucketedColumn)) + assert(bucketSpec1.sortColumnNames.isEmpty) + + val query2 = s"$baseQuery SORTED BY($sortColumn) INTO $numBuckets BUCKETS" + val (desc2, _) = extractTableDesc(query2) + assert(desc2.bucketSpec.isDefined) + val bucketSpec2 = desc2.bucketSpec.get + assert(bucketSpec2.numBuckets == numBuckets) + assert(bucketSpec2.bucketColumnNames.head.equals(bucketedColumn)) + assert(bucketSpec2.sortColumnNames.head.equals(sortColumn)) + } + + test("create table(hive) - skewed by") { + val baseQuery = "CREATE TABLE my_table (id int, name string) SKEWED BY" + val query1 = s"$baseQuery(id) ON (1, 10, 100)" + val query2 = s"$baseQuery(id, name) ON ((1, 'x'), (2, 'y'), (3, 'z'))" + val query3 = s"$baseQuery(id, name) ON ((1, 'x'), (2, 'y'), (3, 'z')) STORED AS DIRECTORIES" + val e1 = intercept[ParseException] { parsePlan(query1) } + val e2 = intercept[ParseException] { parsePlan(query2) } + val e3 = intercept[ParseException] { parsePlan(query3) } + assert(e1.getMessage.contains("Operation not allowed")) + assert(e2.getMessage.contains("Operation not allowed")) + assert(e3.getMessage.contains("Operation not allowed")) + } + + test("create table(hive) - row format") { + val baseQuery = "CREATE TABLE my_table (id int, name string) ROW FORMAT" + val query1 = s"$baseQuery SERDE 'org.apache.poof.serde.Baff'" + val query2 = s"$baseQuery SERDE 'org.apache.poof.serde.Baff' WITH SERDEPROPERTIES ('k1'='v1')" + val query3 = + s""" + |$baseQuery DELIMITED FIELDS TERMINATED BY 'x' ESCAPED BY 'y' + |COLLECTION ITEMS TERMINATED BY 'a' + |MAP KEYS 
TERMINATED BY 'b' + |LINES TERMINATED BY '\n' + |NULL DEFINED AS 'c' + """.stripMargin + val (desc1, _) = extractTableDesc(query1) + val (desc2, _) = extractTableDesc(query2) + val (desc3, _) = extractTableDesc(query3) + assert(desc1.storage.serde == Some("org.apache.poof.serde.Baff")) + assert(desc1.storage.properties.isEmpty) + assert(desc2.storage.serde == Some("org.apache.poof.serde.Baff")) + assert(desc2.storage.properties == Map("k1" -> "v1")) + assert(desc3.storage.properties == Map( + "field.delim" -> "x", + "escape.delim" -> "y", + "serialization.format" -> "x", + "line.delim" -> "\n", + "colelction.delim" -> "a", // yes, it's a typo from Hive :) + "mapkey.delim" -> "b")) + } + + test("create table(hive) - file format") { + val baseQuery = "CREATE TABLE my_table (id int, name string) STORED AS" + val query1 = s"$baseQuery INPUTFORMAT 'winput' OUTPUTFORMAT 'wowput'" + val query2 = s"$baseQuery ORC" + val (desc1, _) = extractTableDesc(query1) + val (desc2, _) = extractTableDesc(query2) + assert(desc1.storage.inputFormat == Some("winput")) + assert(desc1.storage.outputFormat == Some("wowput")) + assert(desc1.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(desc2.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) + assert(desc2.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + assert(desc2.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + } + + test("create table(hive) - storage handler") { + val baseQuery = "CREATE TABLE my_table (id int, name string) STORED BY" + val query1 = s"$baseQuery 'org.papachi.StorageHandler'" + val query2 = s"$baseQuery 'org.mamachi.StorageHandler' WITH SERDEPROPERTIES ('k1'='v1')" + val e1 = intercept[ParseException] { parsePlan(query1) } + val e2 = intercept[ParseException] { parsePlan(query2) } + assert(e1.getMessage.contains("Operation not allowed")) + assert(e2.getMessage.contains("Operation not allowed")) + } + + test("create table - properties") { + val query = "CREATE TABLE my_table (id int, name string) TBLPROPERTIES ('k1'='v1', 'k2'='v2')" + parsePlan(query) match { + case state: CreateTableStatement => + assert(state.properties == Map("k1" -> "v1", "k2" -> "v2")) + } + } + + test("create table(hive) - everything!") { + val query = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS dbx.my_table (id int, name string) + |COMMENT 'no comment' + |PARTITIONED BY (month int) + |ROW FORMAT SERDE 'org.apache.poof.serde.Baff' WITH SERDEPROPERTIES ('k1'='v1') + |STORED AS INPUTFORMAT 'winput' OUTPUTFORMAT 'wowput' + |LOCATION '/path/to/mercury' + |TBLPROPERTIES ('k1'='v1', 'k2'='v2') + """.stripMargin + val (desc, allowExisting) = extractTableDesc(query) + assert(allowExisting) + assert(desc.identifier.database == Some("dbx")) + assert(desc.identifier.table == "my_table") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.schema == new StructType() + .add("id", "int") + .add("name", "string") + .add("month", "int")) + assert(desc.partitionColumnNames == Seq("month")) + assert(desc.bucketSpec.isEmpty) + assert(desc.viewText.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.storage.locationUri == Some(new URI("/path/to/mercury"))) + assert(desc.storage.inputFormat == Some("winput")) + assert(desc.storage.outputFormat == Some("wowput")) + assert(desc.storage.serde == Some("org.apache.poof.serde.Baff")) + assert(desc.storage.properties == Map("k1" -> "v1")) + 
assert(desc.properties == Map("k1" -> "v1", "k2" -> "v2")) + assert(desc.comment == Some("no comment")) + } + // TODO: add tests for more commands. } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 983209051c8ae..00c599065ce31 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -166,13 +166,13 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSparkSession { ) }.getMessage assert(error.contains("Operation not allowed") && - error.contains("CREATE TEMPORARY TABLE ... USING ... AS query")) + error.contains("CREATE TEMPORARY TABLE")) } } test("disallows CREATE EXTERNAL TABLE ... USING ... AS query") { withTable("t") { - val error = intercept[ParseException] { + val error = intercept[AnalysisException] { sql( s""" |CREATE EXTERNAL TABLE t USING PARQUET diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 56b871644453b..b8b1da4cb9db7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -598,8 +598,7 @@ class HiveDDLSuite val e = intercept[AnalysisException] { sql("CREATE TABLE tbl(a int) PARTITIONED BY (b) STORED AS parquet") } - assert(e.message.contains("Must specify a data type for each partition column while creating " + - "Hive partitioned table.")) + assert(e.message.contains("partition column b is not defined in table")) } test("add/drop partition with location - managed table") { @@ -2701,8 +2700,7 @@ class HiveDDLSuite |AS SELECT 1 as a, "a" as b """.stripMargin) }.getMessage - assert(err1.contains("Schema may not be specified in a Create Table As Select " + - "(CTAS) statement")) + assert(err1.contains("Schema may not be specified in a Create Table As Select")) val err2 = intercept[ParseException] { spark.sql( @@ -2713,8 +2711,7 @@ class HiveDDLSuite |AS SELECT 1 as a, "a" as b """.stripMargin) }.getMessage - assert(err2.contains("Create Partitioned Table As Select cannot specify data type for " + - "the partition columns of the target table")) + assert(err2.contains("Partition column types may not be specified in Create Table As Select")) } test("Hive CTAS with dynamic partition") { @@ -2783,7 +2780,7 @@ class HiveDDLSuite |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' """.stripMargin) }.getMessage - assert(e.contains("'ROW FORMAT' must be used with 'STORED AS'")) + assert(e.contains("Operation not allowed: CREATE TABLE LIKE ... USING ... ROW FORMAT SERDE")) // row format doesn't work with provider hive e = intercept[AnalysisException] { @@ -2794,7 +2791,7 @@ class HiveDDLSuite |WITH SERDEPROPERTIES ('test' = 'test') """.stripMargin) }.getMessage - assert(e.contains("'ROW FORMAT' must be used with 'STORED AS'")) + assert(e.contains("Operation not allowed: CREATE TABLE LIKE ... USING ... 
ROW FORMAT SERDE")) // row format doesn't work without 'STORED AS' e = intercept[AnalysisException] { @@ -2807,6 +2804,17 @@ class HiveDDLSuite }.getMessage assert(e.contains("'ROW FORMAT' must be used with 'STORED AS'")) + // 'INPUTFORMAT' and 'OUTPUTFORMAT' conflict with 'USING' + e = intercept[AnalysisException] { + spark.sql( + """ + |CREATE TABLE targetDsTable LIKE sourceDsTable USING format + |STORED AS INPUTFORMAT 'inFormat' OUTPUTFORMAT 'outFormat' + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + """.stripMargin) + }.getMessage + assert(e.contains("Operation not allowed: CREATE TABLE LIKE ... USING ... STORED AS")) + // row format works with STORED AS hive format (from hive table) spark.sql( """ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala index 24b1e3405379c..f723c9f80c2ab 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala @@ -21,11 +21,10 @@ import java.net.URI import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} +import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils} -import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.execution.metric.InputOutputMetricsHelper import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} @@ -71,8 +70,8 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte } private def extractTableDesc(sql: String): (CatalogTable, Boolean) = { - TestHive.sessionState.sqlParser.parsePlan(sql).collect { - case CreateTable(tableDesc, mode, _) => (tableDesc, mode == SaveMode.Ignore) + TestHive.sessionState.analyzer.execute(TestHive.sessionState.sqlParser.parsePlan(sql)).collect { + case CreateTableCommand(tableDesc, ifNotExists) => (tableDesc, ifNotExists) }.head } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 712f81d98753e..79b3c3efe531c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -712,8 +712,7 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi |AS SELECT key, value FROM mytable1 """.stripMargin) }.getMessage - assert(e.contains("Create Partitioned Table As Select cannot specify data type for " + - "the partition columns of the target table")) + assert(e.contains("Partition column types may not be specified in Create Table As Select")) } } } From d691d85701adc3db3b7545b87065f2a5113c2b99 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Wed, 25 Nov 2020 23:15:52 +0800 Subject: [PATCH 013/150] [SPARK-33496][SQL] Improve error message of ANSI explicit cast ### What changes were proposed in this pull request? After https://github.com/apache/spark/pull/30260, there are some type conversions disallowed under ANSI mode. We should tell users what they can do if they have to use the disallowed casting. 
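For illustration, a hypothetical spark-shell session (the query, the config call, and the exact exception wording below are only a sketch based on the message templates added in this patch; `spark` is assumed to be the shell's SparkSession) shows the kind of hint users now get:

```
// Hypothetical session; illustrative only.
spark.conf.set("spark.sql.ansi.enabled", "true")
spark.sql("SELECT CAST(array(1, 2) AS STRING)")
// Analysis now fails with a hint along these lines instead of a bare "cannot cast":
//   cannot cast array<int> to string with ANSI mode on.
//   If you have to cast array<int> to string, you can use the function ARRAY_JOIN
//   or set spark.sql.ansi.enabled as false.
```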
### Why are the changes needed? Make it more user-friendly. ### Does this PR introduce _any_ user-facing change? Yes, the error message is improved on casting failure when ANSI mode is enabled ### How was this patch tested? Unit tests. Closes #30440 from gengliangwang/improveAnsiCastErrorMSG. Authored-by: Gengliang Wang Signed-off-by: Gengliang Wang --- .../spark/sql/catalyst/expressions/Cast.scala | 51 ++++++++++++++++++- .../sql/catalyst/expressions/CastSuite.scala | 38 ++++++++++++-- 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index e5f11b5e74916..e6f585cacc6c7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -262,6 +262,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit */ def canCast(from: DataType, to: DataType): Boolean + /** + * Returns the error message if casting from one type to another one is invalid. + */ + def typeCheckFailureMessage: String + override def toString: String = { val ansi = if (ansiEnabled) "ansi_" else "" s"${ansi}cast($child as ${dataType.simpleString})" @@ -271,8 +276,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit if (canCast(child.dataType, dataType)) { TypeCheckResult.TypeCheckSuccess } else { - TypeCheckResult.TypeCheckFailure( - s"cannot cast ${child.dataType.catalogString} to ${dataType.catalogString}") + TypeCheckResult.TypeCheckFailure(typeCheckFailureMessage) } } @@ -1755,6 +1759,12 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String } else { Cast.canCast(from, to) } + + override def typeCheckFailureMessage: String = if (ansiEnabled) { + AnsiCast.typeCheckFailureMessage(child.dataType, dataType, SQLConf.ANSI_ENABLED.key, "false") + } else { + s"cannot cast ${child.dataType.catalogString} to ${dataType.catalogString}" + } } /** @@ -1774,6 +1784,14 @@ case class AnsiCast(child: Expression, dataType: DataType, timeZoneId: Option[St override protected val ansiEnabled: Boolean = true override def canCast(from: DataType, to: DataType): Boolean = AnsiCast.canCast(from, to) + + // For now, this expression is only used in table insertion. + // If there are more scenarios for this expression, we should update the error message on type + // check failure. + override def typeCheckFailureMessage: String = + AnsiCast.typeCheckFailureMessage(child.dataType, dataType, + SQLConf.STORE_ASSIGNMENT_POLICY.key, SQLConf.StoreAssignmentPolicy.LEGACY.toString) + } object AnsiCast { @@ -1876,6 +1894,35 @@ object AnsiCast { case _ => false } + + def typeCheckFailureMessage( + from: DataType, + to: DataType, + fallbackConfKey: String, + fallbackConfValue: String): String = + (from, to) match { + case (_: NumericType, TimestampType) => + // scalastyle:off line.size.limit + s""" + | cannot cast ${from.catalogString} to ${to.catalogString}. + | To convert values from ${from.catalogString} to ${to.catalogString}, you can use functions TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS instead. + |""".stripMargin + + case (_: ArrayType, StringType) => + s""" + | cannot cast ${from.catalogString} to ${to.catalogString} with ANSI mode on. 
+ | If you have to cast ${from.catalogString} to ${to.catalogString}, you can use the function ARRAY_JOIN or set $fallbackConfKey as $fallbackConfValue. + |""".stripMargin + + case _ if Cast.canCast(from, to) => + s""" + | cannot cast ${from.catalogString} to ${to.catalogString} with ANSI mode on. + | If you have to cast ${from.catalogString} to ${to.catalogString}, you can set $fallbackConfKey as $fallbackConfValue. + |""".stripMargin + + case _ => s"cannot cast ${from.catalogString} to ${to.catalogString}" + // scalastyle:on line.size.limit + } } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 2bc27ad35efff..f1fc921e401ba 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -25,6 +25,7 @@ import scala.collection.parallel.immutable.ParVector import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure import org.apache.spark.sql.catalyst.analysis.TypeCoercion.numericPrecedence import org.apache.spark.sql.catalyst.analysis.TypeCoercionSuite import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectList, CollectSet} @@ -841,12 +842,28 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { cast(Literal(134.12), DecimalType(3, 2)), "cannot be represented") } + protected def setConfigurationHint: String + + private def verifyCastFailure(c: CastBase, optionalExpectedMsg: Option[String] = None): Unit = { + val typeCheckResult = c.checkInputDataTypes() + assert(typeCheckResult.isFailure) + assert(typeCheckResult.isInstanceOf[TypeCheckFailure]) + val message = typeCheckResult.asInstanceOf[TypeCheckFailure].message + + if (optionalExpectedMsg.isDefined) { + assert(message.contains(optionalExpectedMsg.get)) + } else { + assert(message.contains("with ANSI mode on")) + assert(message.contains(setConfigurationHint)) + } + } + test("ANSI mode: disallow type conversions between Numeric types and Timestamp type") { import DataTypeTestUtils.numericTypes checkInvalidCastFromNumericType(TimestampType) val timestampLiteral = Literal(1L, TimestampType) numericTypes.foreach { numericType => - assert(cast(timestampLiteral, numericType).checkInputDataTypes().isFailure) + verifyCastFailure(cast(timestampLiteral, numericType)) } } @@ -855,7 +872,7 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { checkInvalidCastFromNumericType(DateType) val dateLiteral = Literal(1, DateType) numericTypes.foreach { numericType => - assert(cast(dateLiteral, numericType).checkInputDataTypes().isFailure) + verifyCastFailure(cast(dateLiteral, numericType)) } } @@ -880,9 +897,9 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { } test("ANSI mode: disallow casting complex types as String type") { - assert(cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType).checkInputDataTypes().isFailure) - assert(cast(Literal.create(Map(1 -> "a")), StringType).checkInputDataTypes().isFailure) - assert(cast(Literal.create((1, "a", 0.1)), StringType).checkInputDataTypes().isFailure) + verifyCastFailure(cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType)) + verifyCastFailure(cast(Literal.create(Map(1 -> "a")), StringType)) + verifyCastFailure(cast(Literal.create((1, "a", 0.1)), StringType)) } 
test("cast from invalid string to numeric should throw NumberFormatException") { @@ -1489,6 +1506,9 @@ class CastSuiteWithAnsiModeOn extends AnsiCastSuiteBase { case _ => Cast(Literal(v), targetType, timeZoneId) } } + + override def setConfigurationHint: String = + s"set ${SQLConf.ANSI_ENABLED.key} as false" } /** @@ -1511,6 +1531,10 @@ class AnsiCastSuiteWithAnsiModeOn extends AnsiCastSuiteBase { case _ => AnsiCast(Literal(v), targetType, timeZoneId) } } + + override def setConfigurationHint: String = + s"set ${SQLConf.STORE_ASSIGNMENT_POLICY.key} as" + + s" ${SQLConf.StoreAssignmentPolicy.LEGACY.toString}" } /** @@ -1533,4 +1557,8 @@ class AnsiCastSuiteWithAnsiModeOff extends AnsiCastSuiteBase { case _ => AnsiCast(Literal(v), targetType, timeZoneId) } } + + override def setConfigurationHint: String = + s"set ${SQLConf.STORE_ASSIGNMENT_POLICY.key} as" + + s" ${SQLConf.StoreAssignmentPolicy.LEGACY.toString}" } From 9643eab53e4bbaee08f7f8c766b0d1e0d9348d55 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 25 Nov 2020 08:55:39 -0800 Subject: [PATCH 014/150] [SPARK-33540][SQL] Subexpression elimination for interpreted predicate ### What changes were proposed in this pull request? This patch proposes to support subexpression elimination for interpreted predicate. ### Why are the changes needed? Similar to interpreted projection, there are use cases when codegen predicate is not able to work, e.g. too complex schema, non-codegen expression, etc. When there are frequently occurring expressions (subexpressions) among predicate expression, the performance is quite bad as we need to re-compute same expressions. We should be able to support subexpression elimination for interpreted predicate like interpreted projection. ### Does this PR introduce _any_ user-facing change? No, this doesn't change user behavior. ### How was this patch tested? Unit test and benchmark. Closes #30497 from viirya/SPARK-33540. 
Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun --- .../sql/catalyst/expressions/predicates.scala | 19 +++++++++++++++++-- ...ExprEliminationBenchmark-jdk11-results.txt | 16 ++++++++-------- .../SubExprEliminationBenchmark-results.txt | 16 ++++++++-------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 53d6394d0d1f1..53ac3560bc3b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -46,11 +46,26 @@ abstract class BasePredicate { } case class InterpretedPredicate(expression: Expression) extends BasePredicate { - override def eval(r: InternalRow): Boolean = expression.eval(r).asInstanceOf[Boolean] + private[this] val subExprEliminationEnabled = SQLConf.get.subexpressionEliminationEnabled + private[this] lazy val runtime = + new SubExprEvaluationRuntime(SQLConf.get.subexpressionEliminationCacheMaxEntries) + private[this] val expr = if (subExprEliminationEnabled) { + runtime.proxyExpressions(Seq(expression)).head + } else { + expression + } + + override def eval(r: InternalRow): Boolean = { + if (subExprEliminationEnabled) { + runtime.setInput(r) + } + + expr.eval(r).asInstanceOf[Boolean] + } override def initialize(partitionIndex: Int): Unit = { super.initialize(partitionIndex) - expression.foreach { + expr.foreach { case n: Nondeterministic => n.initialize(partitionIndex) case _ => } diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt index 1eb7b534d2194..a7f0acc3cdc86 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt @@ -7,19 +7,19 @@ OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 26447 27127 605 0.0 264467933.4 1.0X -subExprElimination false, codegen: false 25673 26035 546 0.0 256732419.1 1.0X -subExprElimination true, codegen: true 1384 1448 102 0.0 13842910.3 19.1X -subExprElimination true, codegen: false 1244 1347 123 0.0 12442389.3 21.3X +subExprElimination false, codegen: true 24827 25398 562 0.0 248271027.2 1.0X +subExprElimination false, codegen: false 25052 25704 625 0.0 250518603.6 1.0X +subExprElimination true, codegen: true 1540 1606 92 0.0 15403083.7 16.1X +subExprElimination true, codegen: false 1487 1535 53 0.0 14865051.6 16.7X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subexpressionElimination off, codegen on 34631 35449 833 0.0 346309884.0 1.0X -subexpressionElimination off, codegen on 34480 34851 353 0.0 344798490.4 1.0X -subexpressionElimination off, codegen on 16618 16811 291 0.0 166176642.6 2.1X -subexpressionElimination off, codegen on 34316 34667 310 0.0 343157094.7 1.0X +subexpressionElimination off, codegen on 37327 38261 809 0.0 373266387.0 1.0X +subexpressionElimination off, codegen on 36126 37445 1575 0.0 361263987.0 1.0X +subexpressionElimination off, codegen on 20152 21596 1263 0.0 201522903.8 1.9X +subexpressionElimination off, codegen on 20799 20940 233 0.0 207993923.0 1.8X diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt index 801f519ca76a1..e5f1bc14243e0 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt @@ -7,19 +7,19 @@ OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 22767 23240 424 0.0 227665316.7 1.0X -subExprElimination false, codegen: false 22869 23351 465 0.0 228693464.1 1.0X -subExprElimination true, codegen: true 1328 1340 10 0.0 13280056.2 17.1X -subExprElimination true, codegen: false 1248 1276 31 0.0 12476135.1 18.2X +subExprElimination false, codegen: true 23094 23763 585 0.0 230939301.2 1.0X +subExprElimination false, codegen: false 23161 24087 844 0.0 231611379.8 1.0X +subExprElimination true, codegen: true 1492 1517 30 0.0 14921022.9 15.5X +subExprElimination true, codegen: false 1300 1361 93 0.0 12996167.7 17.8X Preparing data for benchmarking ... OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subexpressionElimination off, codegen on 37691 38846 1004 0.0 376913767.9 1.0X -subexpressionElimination off, codegen on 37852 39124 1103 0.0 378517745.5 1.0X -subexpressionElimination off, codegen on 22900 23085 202 0.0 229000242.5 1.6X -subexpressionElimination off, codegen on 38298 38598 374 0.0 382978731.3 1.0X +subexpressionElimination off, codegen on 37069 37767 985 0.0 370694301.5 1.0X +subexpressionElimination off, codegen on 37095 37970 1008 0.0 370945081.6 1.0X +subexpressionElimination off, codegen on 20618 21443 715 0.0 206175173.8 1.8X +subexpressionElimination off, codegen on 21563 21887 307 0.0 215626274.7 1.7X From 7cf6a6f996e25754de13aa66badbe6d1d53efb36 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 25 Nov 2020 09:57:46 -0800 Subject: [PATCH 015/150] [SPARK-31257][SPARK-33561][SQL][FOLLOWUP] Fix Scala 2.13 compilation ### What changes were proposed in this pull request? This PR is a follow-up to fix Scala 2.13 compilation. 
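A standalone sketch of why the explicit conversions are needed (none of this code comes from the patch itself): in Scala 2.13, `asScala` yields a mutable `Buffer` and `filterKeys` yields a lazy view, so `.toSeq` and `.toMap` are required wherever an immutable `Seq` or `Map` is expected.

```
// Illustration only; compiles on both 2.12 and 2.13 (JavaConverters is deprecated in 2.13).
import scala.collection.JavaConverters._

val javaList = java.util.Arrays.asList("a", "b", "c")
// asScala returns a mutable Buffer, which no longer satisfies an immutable Seq in 2.13:
val asSeq: Seq[String] = javaList.asScala.toSeq

// filterKeys is lazy in 2.13, so the result must be materialized with toMap:
val props = Map("option.path" -> "/tmp", "owner" -> "spark")
val options: Map[String, String] = props.filterKeys(_.startsWith("option.")).map {
  case (k, v) => k.stripPrefix("option.") -> v
}.toMap
```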
### Why are the changes needed? To support Scala 2.13 in Apache Spark 3.1. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the GitHub Action Scala 2.13 compilation job. Closes #30502 from dongjoon-hyun/SPARK-31257. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 3 ++- .../scala/org/apache/spark/sql/execution/SparkSqlParser.scala | 2 +- .../spark/sql/execution/datasources/v2/V2SessionCatalog.scala | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 25423e510157a..606d923061441 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2946,7 +2946,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg val location = visitLocationSpecList(ctx.locationSpec()) val (cleanedOptions, newLocation) = cleanTableOptions(ctx, options, location) val comment = visitCommentSpecList(ctx.commentSpec()) - val serdeInfo = getSerdeInfo(ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx) + val serdeInfo = + getSerdeInfo(ctx.rowFormat.asScala.toSeq, ctx.createFileFormat.asScala.toSeq, ctx) (partTransforms, partCols, bucketSpec, cleanedProperties, cleanedOptions, newLocation, comment, serdeInfo) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a92f0775f1c05..568c7112954f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -440,7 +440,7 @@ class SparkSqlAstBuilder extends AstBuilder { val location = visitLocationSpecList(ctx.locationSpec()) // TODO: Do not skip serde check for CREATE TABLE LIKE. val serdeInfo = getSerdeInfo( - ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx, skipCheck = true) + ctx.rowFormat.asScala.toSeq, ctx.createFileFormat.asScala.toSeq, ctx, skipCheck = true) if (provider.isDefined && serdeInfo.isDefined) { operationNotAllowed(s"CREATE TABLE LIKE ... USING ... ${serdeInfo.get.describe}", ctx) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index f330d6a8c99e2..a0bc65d3f9057 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -114,7 +114,7 @@ class V2SessionCatalog(catalog: SessionCatalog) private def toOptions(properties: Map[String, String]): Map[String, String] = { properties.filterKeys(_.startsWith(TableCatalog.OPTION_PREFIX)).map { case (key, value) => key.drop(TableCatalog.OPTION_PREFIX.length) -> value - } + }.toMap } override def alterTable( From 1de3fc42829187c54334df1fb2149dc4aeb78ed9 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Wed, 25 Nov 2020 12:37:59 -0800 Subject: [PATCH 016/150] [SPARK-33525][SQL] Update hive-service-rpc to 3.1.2 ### What changes were proposed in this pull request? 
We supported Hive metastore are 0.12.0 through 3.1.2, but we supported hive-jdbc are 0.12.0 through 2.3.7. It will throw `TProtocolException` if we use hive-jdbc 3.x: ``` [rootspark-3267648 apache-hive-3.1.2-bin]# bin/beeline -u jdbc:hive2://localhost:10000/default Connecting to jdbc:hive2://localhost:10000/default Connected to: Spark SQL (version 3.1.0-SNAPSHOT) Driver: Hive JDBC (version 3.1.2) Transaction isolation: TRANSACTION_REPEATABLE_READ Beeline version 3.1.2 by Apache Hive 0: jdbc:hive2://localhost:10000/default> create table t1(id int) using parquet; Unexpected end of file when reading from HS2 server. The root cause might be too many concurrent connections. Please ask the administrator to check the number of active connections, and adjust hive.server2.thrift.max.worker.threads if applicable. Error: org.apache.thrift.transport.TTransportException (state=08S01,code=0) ``` ``` org.apache.thrift.protocol.TProtocolException: Missing version in readMessageBegin, old client? at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:234) at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:27) at org.apache.hive.service.auth.TSetIpAddressProcessor.process(TSetIpAddressProcessor.java:53) at org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:310) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:630) at java.base/java.lang.Thread.run(Thread.java:832) ``` This pr upgrade hive-service-rpc to 3.1.2 to fix this issue. ### Why are the changes needed? To support hive-jdbc 3.x. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual test: ``` [rootspark-3267648 apache-hive-3.1.2-bin]# bin/beeline -u jdbc:hive2://localhost:10000/default Connecting to jdbc:hive2://localhost:10000/default Connected to: Spark SQL (version 3.1.0-SNAPSHOT) Driver: Hive JDBC (version 3.1.2) Transaction isolation: TRANSACTION_REPEATABLE_READ Beeline version 3.1.2 by Apache Hive 0: jdbc:hive2://localhost:10000/default> create table t1(id int) using parquet; +---------+ | Result | +---------+ +---------+ No rows selected (1.051 seconds) 0: jdbc:hive2://localhost:10000/default> insert into t1 values(1); +---------+ | Result | +---------+ +---------+ No rows selected (2.08 seconds) 0: jdbc:hive2://localhost:10000/default> select * from t1; +-----+ | id | +-----+ | 1 | +-----+ 1 row selected (0.605 seconds) ``` Closes #30478 from wangyum/SPARK-33525. 
Authored-by: Yuming Wang Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 2 +- dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 2 +- pom.xml | 2 +- .../apache/hive/service/cli/CLIService.java | 10 ++++++ .../apache/hive/service/cli/GetInfoType.java | 3 +- .../apache/hive/service/cli/ICLIService.java | 3 ++ .../cli/thrift/ThriftBinaryCLIService.java | 13 ++++++++ .../service/cli/thrift/ThriftCLIService.java | 31 +++++++++++++++++++ .../cli/thrift/ThriftCLIServiceClient.java | 9 ++++++ .../thriftserver/SparkSQLCLIService.scala | 1 + 10 files changed, 72 insertions(+), 4 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index bcf05506855c5..8802220726f78 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -88,7 +88,7 @@ hive-jdbc/2.3.7//hive-jdbc-2.3.7.jar hive-llap-common/2.3.7//hive-llap-common-2.3.7.jar hive-metastore/2.3.7//hive-metastore-2.3.7.jar hive-serde/2.3.7//hive-serde-2.3.7.jar -hive-service-rpc/2.3.7//hive-service-rpc-2.3.7.jar +hive-service-rpc/3.1.2//hive-service-rpc-3.1.2.jar hive-shims-0.23/2.3.7//hive-shims-0.23-2.3.7.jar hive-shims-common/2.3.7//hive-shims-common-2.3.7.jar hive-shims-scheduler/2.3.7//hive-shims-scheduler-2.3.7.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index cd274bef7045b..d45eeea0ee92b 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -65,7 +65,7 @@ hive-jdbc/2.3.7//hive-jdbc-2.3.7.jar hive-llap-common/2.3.7//hive-llap-common-2.3.7.jar hive-metastore/2.3.7//hive-metastore-2.3.7.jar hive-serde/2.3.7//hive-serde-2.3.7.jar -hive-service-rpc/2.3.7//hive-service-rpc-2.3.7.jar +hive-service-rpc/3.1.2//hive-service-rpc-3.1.2.jar hive-shims-0.23/2.3.7//hive-shims-0.23-2.3.7.jar hive-shims-common/2.3.7//hive-shims-common-2.3.7.jar hive-shims-scheduler/2.3.7//hive-shims-scheduler-2.3.7.jar diff --git a/pom.xml b/pom.xml index e5b1f30edd3be..cd7e1767d6b18 100644 --- a/pom.xml +++ b/pom.xml @@ -2088,7 +2088,7 @@ ${hive.group} hive-service-rpc - ${hive.version} + 3.1.2 * diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java index bdc1e6251e560..68f044c6a0f28 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/CLIService.java @@ -45,6 +45,7 @@ import org.apache.hive.service.cli.operation.Operation; import org.apache.hive.service.cli.session.HiveSession; import org.apache.hive.service.cli.session.SessionManager; +import org.apache.hive.service.rpc.thrift.TOperationHandle; import org.apache.hive.service.rpc.thrift.TProtocolVersion; import org.apache.hive.service.server.HiveServer2; import org.slf4j.Logger; @@ -567,6 +568,15 @@ public void renewDelegationToken(SessionHandle sessionHandle, HiveAuthFactory au LOG.info(sessionHandle + ": renewDelegationToken()"); } + @Override + public String getQueryId(TOperationHandle opHandle) throws HiveSQLException { + Operation operation = sessionManager.getOperationManager().getOperation( + new OperationHandle(opHandle)); + final String queryId = operation.getParentSession().getHiveConf().getVar(ConfVars.HIVEQUERYID); + LOG.debug(opHandle + ": getQueryId() " + queryId); + return queryId; + } + public SessionManager getSessionManager() { return sessionManager; } diff --git 
a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoType.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoType.java index a64d262a8f301..575dff8f8f47b 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoType.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoType.java @@ -72,7 +72,8 @@ public enum GetInfoType { CLI_DESCRIBE_PARAMETER(TGetInfoType.CLI_DESCRIBE_PARAMETER), CLI_CATALOG_NAME(TGetInfoType.CLI_CATALOG_NAME), CLI_COLLATION_SEQ(TGetInfoType.CLI_COLLATION_SEQ), - CLI_MAX_IDENTIFIER_LEN(TGetInfoType.CLI_MAX_IDENTIFIER_LEN); + CLI_MAX_IDENTIFIER_LEN(TGetInfoType.CLI_MAX_IDENTIFIER_LEN), + CLI_ODBC_KEYWORDS(TGetInfoType.CLI_ODBC_KEYWORDS); private final TGetInfoType tInfoType; diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ICLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ICLIService.java index 3200909477821..a87c6691ebac7 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ICLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ICLIService.java @@ -24,6 +24,7 @@ import org.apache.hive.service.auth.HiveAuthFactory; +import org.apache.hive.service.rpc.thrift.TOperationHandle; public interface ICLIService { @@ -98,6 +99,8 @@ RowSet fetchResults(OperationHandle opHandle, FetchOrientation orientation, String getDelegationToken(SessionHandle sessionHandle, HiveAuthFactory authFactory, String owner, String renewer) throws HiveSQLException; + String getQueryId(TOperationHandle operationHandle) throws HiveSQLException; + void cancelDelegationToken(SessionHandle sessionHandle, HiveAuthFactory authFactory, String tokenStr) throws HiveSQLException; diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java index ce79e3c8228a6..ffca1070d0047 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java @@ -32,7 +32,11 @@ import org.apache.hive.service.ServiceException; import org.apache.hive.service.auth.HiveAuthFactory; import org.apache.hive.service.cli.CLIService; +import org.apache.hive.service.cli.HiveSQLException; +import org.apache.hive.service.rpc.thrift.TGetQueryIdReq; +import org.apache.hive.service.rpc.thrift.TGetQueryIdResp; import org.apache.hive.service.server.ThreadFactoryWithGarbageCleanup; +import org.apache.thrift.TException; import org.apache.thrift.TProcessorFactory; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.server.TThreadPoolServer; @@ -107,6 +111,15 @@ protected void initializeServer() { } } + @Override + public TGetQueryIdResp GetQueryId(TGetQueryIdReq req) throws TException { + try { + return new TGetQueryIdResp(cliService.getQueryId(req.getOperationHandle())); + } catch (HiveSQLException e) { + throw new TException(e); + } + } + @Override public void run() { try { diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java index ea9ed57410045..150f1d60fc466 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java +++ 
b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java @@ -262,6 +262,28 @@ public TOpenSessionResp OpenSession(TOpenSessionReq req) throws TException { return resp; } + @Override + public TSetClientInfoResp SetClientInfo(TSetClientInfoReq req) throws TException { + // TODO: We don't do anything for now, just log this for debugging. + // We may be able to make use of this later, e.g. for workload management. + if (req.isSetConfiguration()) { + StringBuilder sb = null; + for (Map.Entry e : req.getConfiguration().entrySet()) { + if (sb == null) { + SessionHandle sh = new SessionHandle(req.getSessionHandle()); + sb = new StringBuilder("Client information for ").append(sh).append(": "); + } else { + sb.append(", "); + } + sb.append(e.getKey()).append(" = ").append(e.getValue()); + } + if (sb != null) { + LOG.info("{}", sb); + } + } + return new TSetClientInfoResp(OK_STATUS); + } + private String getIpAddress() { String clientIpAddress; // Http transport mode. @@ -674,6 +696,15 @@ public TGetCrossReferenceResp GetCrossReference(TGetCrossReferenceReq req) protected abstract void initializeServer(); + @Override + public TGetQueryIdResp GetQueryId(TGetQueryIdReq req) throws TException { + try { + return new TGetQueryIdResp(cliService.getQueryId(req.getOperationHandle())); + } catch (HiveSQLException e) { + throw new TException(e); + } + } + @Override public abstract void run(); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java index b13ddf72f77e7..0e81e4446caac 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java @@ -490,4 +490,13 @@ public OperationHandle getCrossReference(SessionHandle sessionHandle, throw new HiveSQLException(e); } } + + @Override + public String getQueryId(TOperationHandle operationHandle) throws HiveSQLException { + try { + return cliService.GetQueryId(new TGetQueryIdReq(operationHandle)).getQueryId(); + } catch (TException e) { + throw new HiveSQLException(e); + } + } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala index df0fa514ccff3..e9420ad21bebd 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala @@ -104,6 +104,7 @@ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLC case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL") case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL") case GetInfoType.CLI_DBMS_VER => new GetInfoValue(sqlContext.sparkContext.version) + case GetInfoType.CLI_ODBC_KEYWORDS => new GetInfoValue("Unimplemented") case _ => super.getInfo(sessionHandle, getInfoType) } } From c529426d872c6f09b05679ba76478e3b932e3696 Mon Sep 17 00:00:00 2001 From: shane knapp Date: Wed, 25 Nov 2020 15:15:50 -0800 Subject: [PATCH 017/150] [SPARK-33565][BUILD][PYTHON] remove python3.8 and fix breakage ### What changes were proposed in this pull request? 
remove python 3.8 from python/run-tests.py and stop build breaks ### Why are the changes needed? the python tests are running against the bare-bones system install of python3, rather than an anaconda environment. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? via jenkins Closes #30506 from shaneknapp/remove-py38. Authored-by: shane knapp Signed-off-by: shane knapp --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index 712f38fb81b83..34800b0e9fa54 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -160,7 +160,7 @@ def run_individual_python_test(target_dir, test_name, pyspark_python): def get_default_python_executables(): - python_execs = [x for x in ["python3.6", "python3.8", "pypy3"] if which(x)] + python_execs = [x for x in ["python3.6", "pypy3"] if which(x)] if "python3.6" not in python_execs: p = which("python3") From fb7b87021437c52d72ad276f92c8d6f5443ebd78 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 25 Nov 2020 15:22:47 -0800 Subject: [PATCH 018/150] [SPARK-33523][SQL][TEST][FOLLOWUP] Fix benchmark case name in SubExprEliminationBenchmark ### What changes were proposed in this pull request? Fix the wrong benchmark case name. ### Why are the changes needed? The last commit to refactor the benchmark code missed a change of case name. ### Does this PR introduce _any_ user-facing change? No, dev only. ### How was this patch tested? Unit test. Closes #30505 from viirya/SPARK-33523-followup. Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun --- ...SubExprEliminationBenchmark-jdk11-results.txt | 16 ++++++++-------- .../SubExprEliminationBenchmark-results.txt | 16 ++++++++-------- .../execution/SubExprEliminationBenchmark.scala | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt index a7f0acc3cdc86..5eeb485a921b8 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt @@ -7,19 +7,19 @@ OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 24827 25398 562 0.0 248271027.2 1.0X -subExprElimination false, codegen: false 25052 25704 625 0.0 250518603.6 1.0X -subExprElimination true, codegen: true 1540 1606 92 0.0 15403083.7 16.1X -subExprElimination true, codegen: false 1487 1535 53 0.0 14865051.6 16.7X +subExprElimination false, codegen: true 22482 23194 652 0.0 224817884.1 1.0X +subExprElimination false, codegen: false 22544 22658 155 0.0 225436869.9 1.0X +subExprElimination true, codegen: true 1371 1403 34 0.0 13710714.3 16.4X +subExprElimination true, codegen: false 1295 1317 20 0.0 12949824.3 17.4X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subexpressionElimination off, codegen on 37327 38261 809 0.0 373266387.0 1.0X -subexpressionElimination off, codegen on 36126 37445 1575 0.0 361263987.0 1.0X -subexpressionElimination off, codegen on 20152 21596 1263 0.0 201522903.8 1.9X -subexpressionElimination off, codegen on 20799 20940 233 0.0 207993923.0 1.8X +subExprElimination false, codegen: true 34976 35331 326 0.0 349759975.5 1.0X +subExprElimination false, codegen: false 34101 34802 607 0.0 341014685.7 1.0X +subExprElimination true, codegen: true 19440 19622 272 0.0 194402251.0 1.8X +subExprElimination true, codegen: false 19247 20064 719 0.0 192466667.6 1.8X diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt index e5f1bc14243e0..49a107f542857 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt @@ -7,19 +7,19 @@ OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subExprElimination false, codegen: true 23094 23763 585 0.0 230939301.2 1.0X -subExprElimination false, codegen: false 23161 24087 844 0.0 231611379.8 1.0X -subExprElimination true, codegen: true 1492 1517 30 0.0 14921022.9 15.5X -subExprElimination true, codegen: false 1300 1361 93 0.0 12996167.7 17.8X +subExprElimination false, codegen: true 25399 25869 466 0.0 253992369.6 1.0X +subExprElimination false, codegen: false 24086 25094 888 0.0 240858699.5 1.1X +subExprElimination true, codegen: true 1527 1600 64 0.0 15274388.8 16.6X +subExprElimination true, codegen: false 1560 1600 52 0.0 15597825.4 16.3X Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -subexpressionElimination off, codegen on 37069 37767 985 0.0 370694301.5 1.0X -subexpressionElimination off, codegen on 37095 37970 1008 0.0 370945081.6 1.0X -subexpressionElimination off, codegen on 20618 21443 715 0.0 206175173.8 1.8X -subexpressionElimination off, codegen on 21563 21887 307 0.0 215626274.7 1.7X +subExprElimination false, codegen: true 39661 40585 844 0.0 396612867.5 1.0X +subExprElimination false, codegen: false 40633 48813 1858 0.0 406328241.3 1.0X +subExprElimination true, codegen: true 25819 27096 1174 0.0 258194064.4 1.5X +subExprElimination true, codegen: false 23467 25137 1447 0.0 234668398.2 1.7X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala index e26acbcb3cd21..0ed0126add7a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala @@ -100,7 +100,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark { // We only benchmark subexpression performance under codegen/non-codegen, so disabling // json optimization. val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled" - benchmark.addCase("subexpressionElimination off, codegen on", numIters) { _ => + benchmark.addCase(caseName, numIters) { _ => withSQLConf( SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled, SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled, From 919ea45e89b17d2f9b336dc4bfe6e15e8a083ed3 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 26 Nov 2020 10:19:38 +0900 Subject: [PATCH 019/150] [SPARK-33562][UI] Improve the style of the checkbox in executor page ### What changes were proposed in this pull request? 1. Remove the fixed width style of class `container-fluid-div`. So that the UI looks clean when the text is long. 2. Add one space between a checkbox and the text on the right side, which is consistent with the stage page. ### Why are the changes needed? The width of class `container-fluid-div` is set as 200px after https://github.com/apache/spark/pull/21688 . This makes the checkbox in the executor page messy. ![image](https://user-images.githubusercontent.com/1097932/100242069-3bc5ab80-2ee9-11eb-8c7d-96c221398fee.png) We should remove the width limit. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manual test. After the changes: ![image](https://user-images.githubusercontent.com/1097932/100257802-2f4a4e80-2efb-11eb-9eb0-92d6988ad14b.png) Closes #30500 from gengliangwang/reviseStyle. 
Authored-by: Gengliang Wang Signed-off-by: HyukjinKwon --- .../apache/spark/ui/static/executorspage.js | 18 +++++++++--------- .../org/apache/spark/ui/static/webui.css | 4 ---- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index 4f179a93c9d5f..1d3f628f5fab6 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -608,15 +608,15 @@ $(document).ready(function () { "Show Additional Metrics" + "" + "

" + - "
Select All
" + - "
On Heap Memory
" + - "
Off Heap Memory
" + - "
Peak JVM Memory OnHeap / OffHeap
" + - "
Peak Execution Memory OnHeap / OffHeap
" + - "
Peak Storage Memory OnHeap / OffHeap
" + - "
Peak Pool Memory Direct / Mapped
" + - "
Resources
" + - "
Resource Profile Id
" + + "
Select All
" + + "
On Heap Memory
" + + "
Off Heap Memory
" + + "
Peak JVM Memory OnHeap / OffHeap
" + + "
Peak Execution Memory OnHeap / OffHeap
" + + "
Peak Storage Memory OnHeap / OffHeap
" + + "
Peak Pool Memory Direct / Mapped
" + + "
Resources
" + + "
Resource Profile Id
" + "
"); reselectCheckboxesBasedOnTaskTableState(); diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index d4394ebcfd258..262cee7b58aff 100755 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -321,10 +321,6 @@ a.expandbutton { width: 100%; } -.container-fluid-div { - width: 200px; -} - .select-all-div-checkbox-div { width: 90px; } From ed9e6fc18236ef6994c7f24a4017cf43f77b7ca1 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Thu, 26 Nov 2020 11:42:12 +0900 Subject: [PATCH 020/150] [SPARK-33565][INFRA][FOLLOW-UP] Keep the test coverage with Python 3.8 in GitHub Actions ### What changes were proposed in this pull request? This PR proposes to keep the test coverage with Python 3.8 in GitHub Actions. It is not tested for now in Jenkins due to an env issue. **Before this change in GitHub Actions:** ``` ======================================================================== Running PySpark tests ======================================================================== Running PySpark tests. Output is in /__w/spark/spark/python/unit-tests.log Will test against the following Python executables: ['python3.6', 'pypy3'] ... ``` **After this change in GitHub Actions:** ``` ======================================================================== Running PySpark tests ======================================================================== Running PySpark tests. Output is in /__w/spark/spark/python/unit-tests.log Will test against the following Python executables: ['python3.6', 'python3.8', 'pypy3'] ``` ### Why are the changes needed? To keep the test coverage with Python 3.8 in GitHub Actions. ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? GitHub Actions in this build will test. Closes #30510 from HyukjinKwon/SPARK-33565. Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon --- dev/run-tests.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index 5bdbc0ffb850c..6bc73ca3669f3 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -483,6 +483,12 @@ def run_python_tests(test_modules, parallelism, with_coverage=False): if test_modules != [modules.root]: command.append("--modules=%s" % ','.join(m.name for m in test_modules)) command.append("--parallelism=%i" % parallelism) + if "GITHUB_ACTIONS" in os.environ: + # See SPARK-33565. Python 3.8 was temporarily removed as its default Python executables + # to test because of Jenkins environment issue. Once Jenkins has Python 3.8 to test, + # we should remove this change back and add python3.8 into python/run-tests.py script. + command.append("--python-executable=%s" % ','.join( + x for x in ["python3.6", "python3.8", "pypy3"] if which(x))) run_cmd(command) if with_coverage: From dfa3978d9191e02eabf65d1829c970644d25d57e Mon Sep 17 00:00:00 2001 From: Maryann Xue Date: Wed, 25 Nov 2020 19:32:22 -0800 Subject: [PATCH 021/150] [SPARK-33551][SQL] Do not use custom shuffle reader for repartition ### What changes were proposed in this pull request? This PR fixes an AQE issue where local shuffle reader, partition coalescing, or skew join optimization can be mistakenly applied to a shuffle introduced by repartition or a regular shuffle that logically replaces a repartition shuffle. 
The proposed solution checks for the presence of any repartition shuffle and filters out not applicable optimization rules for the final stage in an AQE plan. ### Why are the changes needed? Without the change, the output of a repartition query may not be correct. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added UT. Closes #30494 from maryannxue/csr-repartition. Authored-by: Maryann Xue Signed-off-by: Xiao Li --- .../apache/spark/sql/internal/SQLConf.scala | 2 +- .../adaptive/AdaptiveSparkPlanExec.scala | 31 +++-- .../adaptive/CoalesceShufflePartitions.scala | 11 +- .../adaptive/CustomShuffleReaderRule.scala | 33 +++++ .../adaptive/OptimizeLocalShuffleReader.scala | 9 +- .../adaptive/OptimizeSkewedJoin.scala | 14 ++- .../adaptive/AdaptiveQueryExecSuite.scala | 116 +++++++++++++++++- 7 files changed, 187 insertions(+), 29 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderRule.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0738478888aeb..add9a1d0f3aa6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -509,7 +509,7 @@ object SQLConf { "'spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes'") .version("3.0.0") .intConf - .checkValue(_ > 0, "The skew factor must be positive.") + .checkValue(_ >= 0, "The skew factor cannot be negative.") .createWithDefault(5) val SKEW_JOIN_SKEWED_PARTITION_THRESHOLD = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 570edbf5f78a3..89d3b53510469 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -37,8 +37,6 @@ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan -import org.apache.spark.sql.execution.command.DataWritingCommandExec -import org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec import org.apache.spark.sql.execution.exchange._ import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} import org.apache.spark.sql.internal.SQLConf @@ -104,16 +102,6 @@ case class AdaptiveSparkPlanExec( OptimizeLocalShuffleReader ) - private def finalStageOptimizerRules: Seq[Rule[SparkPlan]] = - context.qe.sparkPlan match { - case _: DataWritingCommandExec | _: V2TableWriteExec => - // SPARK-32932: Local shuffle reader could break partitioning that works best - // for the following writing command - queryStageOptimizerRules.filterNot(_ == OptimizeLocalShuffleReader) - case _ => - queryStageOptimizerRules - } - // A list of physical optimizer rules to be applied right after a new stage is created. The input // plan to these rules has exchange as its root node. 
@transient private val postStageCreationRules = Seq( @@ -121,6 +109,23 @@ case class AdaptiveSparkPlanExec( CollapseCodegenStages() ) + // The partitioning of the query output depends on the shuffle(s) in the final stage. If the + // original plan contains a repartition operator, we need to preserve the specified partitioning, + // whether or not the repartition-introduced shuffle is optimized out because of an underlying + // shuffle of the same partitioning. Thus, we need to exclude some `CustomShuffleReaderRule`s + // from the final stage, depending on the presence and properties of repartition operators. + private def finalStageOptimizerRules: Seq[Rule[SparkPlan]] = { + val origins = inputPlan.collect { + case s: ShuffleExchangeLike => s.shuffleOrigin + } + val allRules = queryStageOptimizerRules ++ postStageCreationRules + allRules.filter { + case c: CustomShuffleReaderRule => + origins.forall(c.supportedShuffleOrigins.contains) + case _ => true + } + } + @transient private val costEvaluator = SimpleCostEvaluator @transient private val initialPlan = context.session.withActive { @@ -249,7 +254,7 @@ case class AdaptiveSparkPlanExec( // Run the final plan when there's no more unfinished stages. currentPhysicalPlan = applyPhysicalRules( result.newPlan, - finalStageOptimizerRules ++ postStageCreationRules, + finalStageOptimizerRules, Some((planChangeLogger, "AQE Final Query Stage Optimization"))) isFinalPlan = true executionId.foreach(onUpdatePlan(_, Seq(currentPhysicalPlan))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala index 0cf3ab0cca49a..0f482142227d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala @@ -19,16 +19,18 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.plans.physical.SinglePartition -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, REPARTITION, ShuffleExchangeLike} +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, REPARTITION, ShuffleExchangeLike, ShuffleOrigin} import org.apache.spark.sql.internal.SQLConf /** * A rule to coalesce the shuffle partitions based on the map output statistics, which can * avoid many small reduce tasks that hurt performance. 
*/ -case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPlan] { +case class CoalesceShufflePartitions(session: SparkSession) extends CustomShuffleReaderRule { + + override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS, REPARTITION) + override def apply(plan: SparkPlan): SparkPlan = { if (!conf.coalesceShufflePartitionsEnabled) { return plan @@ -86,7 +88,6 @@ case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPl } private def supportCoalesce(s: ShuffleExchangeLike): Boolean = { - s.outputPartitioning != SinglePartition && - (s.shuffleOrigin == ENSURE_REQUIREMENTS || s.shuffleOrigin == REPARTITION) + s.outputPartitioning != SinglePartition && supportedShuffleOrigins.contains(s.shuffleOrigin) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderRule.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderRule.scala new file mode 100644 index 0000000000000..c5b8f73ea59d3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderRule.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.exchange.ShuffleOrigin + +/** + * Adaptive Query Execution rule that may create [[CustomShuffleReaderExec]] on top of query stages. + */ +trait CustomShuffleReaderRule extends Rule[SparkPlan] { + + /** + * Returns the list of [[ShuffleOrigin]]s supported by this rule. 
+ */ + def supportedShuffleOrigins: Seq[ShuffleOrigin] +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala index 8f57947cb6396..4dc982d666d18 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala @@ -19,9 +19,8 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.catalyst.plans.physical.SinglePartition -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleExchangeLike} +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin} import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.internal.SQLConf @@ -34,7 +33,9 @@ import org.apache.spark.sql.internal.SQLConf * then run `EnsureRequirements` to check whether additional shuffle introduced. * If introduced, we will revert all the local readers. */ -object OptimizeLocalShuffleReader extends Rule[SparkPlan] { +object OptimizeLocalShuffleReader extends CustomShuffleReaderRule { + + override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS) private val ensureRequirements = EnsureRequirements @@ -144,6 +145,6 @@ object OptimizeLocalShuffleReader extends Rule[SparkPlan] { } private def supportLocalReader(s: ShuffleExchangeLike): Boolean = { - s.outputPartitioning != SinglePartition && s.shuffleOrigin == ENSURE_REQUIREMENTS + s.outputPartitioning != SinglePartition && supportedShuffleOrigins.contains(s.shuffleOrigin) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala index 582d586c59358..085934d906b3c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -23,9 +23,8 @@ import org.apache.commons.io.FileUtils import org.apache.spark.{MapOutputStatistics, MapOutputTrackerMaster, SparkEnv} import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ShuffleExchangeExec} +import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleOrigin} import org.apache.spark.sql.execution.joins.SortMergeJoinExec import org.apache.spark.sql.internal.SQLConf @@ -53,7 +52,9 @@ import org.apache.spark.sql.internal.SQLConf * Note that, when this rule is enabled, it also coalesces non-skewed partitions like * `CoalesceShufflePartitions` does. 
*/ -object OptimizeSkewedJoin extends Rule[SparkPlan] { +object OptimizeSkewedJoin extends CustomShuffleReaderRule { + + override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS) private val ensureRequirements = EnsureRequirements @@ -290,7 +291,9 @@ object OptimizeSkewedJoin extends Rule[SparkPlan] { private object ShuffleStage { def unapply(plan: SparkPlan): Option[ShuffleStageInfo] = plan match { - case s: ShuffleQueryStageExec if s.mapStats.isDefined => + case s: ShuffleQueryStageExec + if s.mapStats.isDefined && + OptimizeSkewedJoin.supportedShuffleOrigins.contains(s.shuffle.shuffleOrigin) => val mapStats = s.mapStats.get val sizes = mapStats.bytesByPartitionId val partitions = sizes.zipWithIndex.map { @@ -299,7 +302,8 @@ private object ShuffleStage { Some(ShuffleStageInfo(s, mapStats, partitions)) case CustomShuffleReaderExec(s: ShuffleQueryStageExec, partitionSpecs) - if s.mapStats.isDefined && partitionSpecs.nonEmpty => + if s.mapStats.isDefined && partitionSpecs.nonEmpty && + OptimizeSkewedJoin.supportedShuffleOrigins.contains(s.shuffle.shuffleOrigin) => val mapStats = s.mapStats.get val sizes = mapStats.bytesByPartitionId val partitions = partitionSpecs.map { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 758965954b374..45ba2202d83d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.execution.{PartialReducerPartitionSpec, QueryExecuti import org.apache.spark.sql.execution.command.DataWritingCommandExec import org.apache.spark.sql.execution.datasources.noop.NoopDataSource import org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec -import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec, ShuffleExchangeExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, REPARTITION, REPARTITION_WITH_NUM, ReusedExchangeExec, ShuffleExchangeExec, ShuffleExchangeLike} import org.apache.spark.sql.execution.joins.{BaseJoinExec, BroadcastHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate import org.apache.spark.sql.functions._ @@ -1317,4 +1317,118 @@ class AdaptiveQueryExecSuite checkNumLocalShuffleReaders(df.queryExecution.executedPlan, numShufflesWithoutLocalReader = 1) } } + + test("SPARK-33551: Do not use custom shuffle reader for repartition") { + def hasRepartitionShuffle(plan: SparkPlan): Boolean = { + find(plan) { + case s: ShuffleExchangeLike => + s.shuffleOrigin == REPARTITION || s.shuffleOrigin == REPARTITION_WITH_NUM + case _ => false + }.isDefined + } + + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.SHUFFLE_PARTITIONS.key -> "5") { + val df = sql( + """ + |SELECT * FROM ( + | SELECT * FROM testData WHERE key = 1 + |) + |RIGHT OUTER JOIN testData2 + |ON value = b + """.stripMargin) + + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") { + // Repartition with no partition num specified. + val dfRepartition = df.repartition('b) + dfRepartition.collect() + val plan = dfRepartition.queryExecution.executedPlan + // The top shuffle from repartition is optimized out. 
+ assert(!hasRepartitionShuffle(plan)) + val bhj = findTopLevelBroadcastHashJoin(plan) + assert(bhj.length == 1) + checkNumLocalShuffleReaders(plan, 1) + // Probe side is coalesced. + val customReader = bhj.head.right.find(_.isInstanceOf[CustomShuffleReaderExec]) + assert(customReader.isDefined) + assert(customReader.get.asInstanceOf[CustomShuffleReaderExec].hasCoalescedPartition) + + // Repartition with partition default num specified. + val dfRepartitionWithNum = df.repartition(5, 'b) + dfRepartitionWithNum.collect() + val planWithNum = dfRepartitionWithNum.queryExecution.executedPlan + // The top shuffle from repartition is optimized out. + assert(!hasRepartitionShuffle(planWithNum)) + val bhjWithNum = findTopLevelBroadcastHashJoin(planWithNum) + assert(bhjWithNum.length == 1) + checkNumLocalShuffleReaders(planWithNum, 1) + // Probe side is not coalesced. + assert(bhjWithNum.head.right.find(_.isInstanceOf[CustomShuffleReaderExec]).isEmpty) + + // Repartition with partition non-default num specified. + val dfRepartitionWithNum2 = df.repartition(3, 'b) + dfRepartitionWithNum2.collect() + val planWithNum2 = dfRepartitionWithNum2.queryExecution.executedPlan + // The top shuffle from repartition is not optimized out, and this is the only shuffle that + // does not have local shuffle reader. + assert(hasRepartitionShuffle(planWithNum2)) + val bhjWithNum2 = findTopLevelBroadcastHashJoin(planWithNum2) + assert(bhjWithNum2.length == 1) + checkNumLocalShuffleReaders(planWithNum2, 1) + val customReader2 = bhjWithNum2.head.right.find(_.isInstanceOf[CustomShuffleReaderExec]) + assert(customReader2.isDefined) + assert(customReader2.get.asInstanceOf[CustomShuffleReaderExec].isLocalReader) + } + + // Force skew join + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.SKEW_JOIN_ENABLED.key -> "true", + SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "1", + SQLConf.SKEW_JOIN_SKEWED_PARTITION_FACTOR.key -> "0", + SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "10") { + // Repartition with no partition num specified. + val dfRepartition = df.repartition('b) + dfRepartition.collect() + val plan = dfRepartition.queryExecution.executedPlan + // The top shuffle from repartition is optimized out. + assert(!hasRepartitionShuffle(plan)) + val smj = findTopLevelSortMergeJoin(plan) + assert(smj.length == 1) + // No skew join due to the repartition. + assert(!smj.head.isSkewJoin) + // Both sides are coalesced. + val customReaders = collect(smj.head) { + case c: CustomShuffleReaderExec if c.hasCoalescedPartition => c + } + assert(customReaders.length == 2) + + // Repartition with default partition num specified. + val dfRepartitionWithNum = df.repartition(5, 'b) + dfRepartitionWithNum.collect() + val planWithNum = dfRepartitionWithNum.queryExecution.executedPlan + // The top shuffle from repartition is optimized out. + assert(!hasRepartitionShuffle(planWithNum)) + val smjWithNum = findTopLevelSortMergeJoin(planWithNum) + assert(smjWithNum.length == 1) + // No skew join due to the repartition. + assert(!smjWithNum.head.isSkewJoin) + // No coalesce due to the num in repartition. + val customReadersWithNum = collect(smjWithNum.head) { + case c: CustomShuffleReaderExec if c.hasCoalescedPartition => c + } + assert(customReadersWithNum.isEmpty) + + // Repartition with default non-partition num specified. 
+ val dfRepartitionWithNum2 = df.repartition(3, 'b) + dfRepartitionWithNum2.collect() + val planWithNum2 = dfRepartitionWithNum2.queryExecution.executedPlan + // The top shuffle from repartition is not optimized out. + assert(hasRepartitionShuffle(planWithNum2)) + val smjWithNum2 = findTopLevelSortMergeJoin(planWithNum2) + assert(smjWithNum2.length == 1) + // Skew join can apply as the repartition is not optimized out. + assert(smjWithNum2.head.isSkewJoin) + } + } + } } From d082ad0abfe0bc26760626ae0ecb415a8d508a1f Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 27 Nov 2020 11:00:09 +0900 Subject: [PATCH 022/150] [SPARK-33563][PYTHON][R][SQL] Expose inverse hyperbolic trig functions in PySpark and SparkR ### What changes were proposed in this pull request? This PR adds the following functions (introduced in Scala API with SPARK-33061): - `acosh` - `asinh` - `atanh` to Python and R. ### Why are the changes needed? Feature parity. ### Does this PR introduce _any_ user-facing change? New functions. ### How was this patch tested? New unit tests. Closes #30501 from zero323/SPARK-33563. Authored-by: zero323 Signed-off-by: HyukjinKwon --- R/pkg/NAMESPACE | 3 ++ R/pkg/R/functions.R | 39 ++++++++++++++++++++ R/pkg/tests/fulltests/test_sparkSQL.R | 1 + python/docs/source/reference/pyspark.sql.rst | 4 +- python/pyspark/sql/functions.py | 39 ++++++++++++++++++++ python/pyspark/sql/functions.pyi | 3 ++ python/pyspark/sql/tests/test_functions.py | 16 ++++++++ 7 files changed, 104 insertions(+), 1 deletion(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index b927a6b96b810..91f6e6dc8a0e6 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -202,6 +202,7 @@ exportMethods("%<=>%", "%in%", "abs", "acos", + "acosh", "add_months", "alias", "approx_count_distinct", @@ -232,8 +233,10 @@ exportMethods("%<=>%", "asc_nulls_last", "ascii", "asin", + "asinh", "assert_true", "atan", + "atanh", "atan2", "avg", "base64", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 039d28a3a37b6..b12f7b472ec83 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -455,6 +455,19 @@ setMethod("acos", column(jc) }) +#' @details +#' \code{acosh}: Computes inverse hyperbolic cosine of the input column. +#' +#' @rdname column_math_functions +#' @aliases acosh acosh,Column-method +#' @note acosh since 3.1.0 +setMethod("acosh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "acosh", x@jc) + column(jc) + }) + #' @details #' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group. #' @@ -522,6 +535,19 @@ setMethod("asin", column(jc) }) +#' @details +#' \code{asinh}: Computes inverse hyperbolic sine of the input column. +#' +#' @rdname column_math_functions +#' @aliases asinh asinh,Column-method +#' @note asinh since 3.1.0 +setMethod("asinh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "asinh", x@jc) + column(jc) + }) + #' @details #' \code{atan}: Returns the inverse tangent of the given value, #' as if computed by \code{java.lang.Math.atan()} @@ -536,6 +562,19 @@ setMethod("atan", column(jc) }) +#' @details +#' \code{atanh}: Computes inverse hyperbolic tangent of the input column. 
+#' +#' @rdname column_math_functions +#' @aliases atanh atanh,Column-method +#' @note atanh since 3.1.0 +setMethod("atanh", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "atanh", x@jc) + column(jc) + }) + #' avg #' #' Aggregate function: returns the average of the values in a group. diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 45de1ef1bd3d1..81d4e14df791d 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1430,6 +1430,7 @@ test_that("column functions", { nth_value(column("v"), 3) + nth_value(column("z"), 4L, FALSE) c28 <- asc_nulls_first(c1) + asc_nulls_last(c1) + desc_nulls_first(c1) + desc_nulls_last(c1) + c29 <- acosh(c1) + asinh(c1) + atanh(c1) # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst index 3f903fe8c7acd..0dc2f6e55bb96 100644 --- a/python/docs/source/reference/pyspark.sql.rst +++ b/python/docs/source/reference/pyspark.sql.rst @@ -307,6 +307,7 @@ Functions abs acos + acosh add_months aggregate approxCountDistinct @@ -331,8 +332,10 @@ Functions asc_nulls_last ascii asin + asinh assert_true atan + atanh atan2 avg base64 @@ -583,4 +586,3 @@ Grouping GroupedData.pivot GroupedData.sum PandasCogroupedOps.applyInPandas - diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4af5d1f484ee4..ea91e8593e21f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -220,6 +220,19 @@ def acos(col): return _invoke_function_over_column("acos", col) +def acosh(col): + """ + Computes inverse hyperbolic cosine of the input column. + + .. versionadded:: 3.1.0 + + Returns + ------- + :class:`Column` + """ + return _invoke_function_over_column("acosh", col) + + def asin(col): """ .. versionadded:: 1.3.0 @@ -233,6 +246,19 @@ def asin(col): return _invoke_function_over_column("asin", col) +def asinh(col): + """ + Computes inverse hyperbolic sine of the input column. + + .. versionadded:: 3.1.0 + + Returns + ------- + :class:`Column` + """ + return _invoke_function_over_column("asinh", col) + + def atan(col): """ .. versionadded:: 1.4.0 @@ -245,6 +271,19 @@ def atan(col): return _invoke_function_over_column("atan", col) +def atanh(col): + """ + Computes inverse hyperbolic tangent of the input column. + + .. versionadded:: 3.1.0 + + Returns + ------- + :class:`Column` + """ + return _invoke_function_over_column("atanh", col) + + @since(1.4) def cbrt(col): """ diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index 252f883b5fb09..50e178df9996f 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -260,12 +260,15 @@ def map_zip_with( ) -> Column: ... def abs(col: ColumnOrName) -> Column: ... def acos(col: ColumnOrName) -> Column: ... +def acosh(col: ColumnOrName) -> Column: ... def asc(col: ColumnOrName) -> Column: ... def asc_nulls_first(col: ColumnOrName) -> Column: ... def asc_nulls_last(col: ColumnOrName) -> Column: ... def ascii(col: ColumnOrName) -> Column: ... def asin(col: ColumnOrName) -> Column: ... +def asinh(col: ColumnOrName) -> Column: ... def atan(col: ColumnOrName) -> Column: ... +def atanh(col: ColumnOrName) -> Column: ... @overload def atan2(col1: ColumnOrName, col2: ColumnOrName) -> Column: ... 
@overload diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 32549343d938f..2858bdeca0d5a 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -116,6 +116,7 @@ def assert_close(a, b): c = get_values(b) diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)] return sum(diff) == len(a) + assert_close([math.cos(i) for i in range(10)], df.select(functions.cos(df.a)).collect()) assert_close([math.cos(i) for i in range(10)], @@ -139,6 +140,21 @@ def assert_close(a, b): assert_close([math.hypot(i, 2) for i in range(10)], df.select(functions.hypot(df.a, 2)).collect()) + def test_inverse_trig_functions(self): + from pyspark.sql import functions + + funs = [ + (functions.acosh, "ACOSH"), + (functions.asinh, "ASINH"), + (functions.atanh, "ATANH"), + ] + + cols = ["a", functions.col("a")] + + for f, alias in funs: + for c in cols: + self.assertIn(f"{alias}(a)", repr(f(c))) + def test_rand_functions(self): df = self.df from pyspark.sql import functions From 433ae9064f55b8adb27b561e1ff17c32f0bf3465 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Fri, 27 Nov 2020 15:47:39 +0900 Subject: [PATCH 023/150] [SPARK-33566][CORE][SQL][SS][PYTHON] Make unescapedQuoteHandling option configurable when read CSV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? There are some differences between Spark CSV, opencsv and commons-csv, the typical case are described in SPARK-33566, When there are both unescaped quotes and unescaped qualifier in value, the results of parsing are different. The reason for the difference is Spark use `STOP_AT_DELIMITER` as default `UnescapedQuoteHandling` to build `CsvParser` and it not configurable. On the other hand, opencsv and commons-csv use the parsing mechanism similar to `STOP_AT_CLOSING_QUOTE ` by default. So this pr make `unescapedQuoteHandling` option configurable to get the same parsing result as opencsv and commons-csv. ### Why are the changes needed? Make unescapedQuoteHandling option configurable when read CSV to make parsing more flexible。 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Pass the Jenkins or GitHub Action - Add a new case similar to that described in SPARK-33566 Closes #30518 from LuciferYang/SPARK-33566. 
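For illustration only (not part of the patch), a minimal sketch of how the new reader option is set; the header setting and input path are assumptions of the example:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Treat an unescaped quote as part of a quoted value and keep reading until the
// closing quote, matching the opencsv/commons-csv style of parsing.
val df = spark.read
  .option("header", "true")
  .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE") // option added by this patch
  .csv("/tmp/unescaped_quotes.csv") // hypothetical input path
```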
Authored-by: yangjie01 Signed-off-by: HyukjinKwon --- python/pyspark/sql/readwriter.py | 26 +++++++++++++++++-- python/pyspark/sql/readwriter.pyi | 1 + python/pyspark/sql/streaming.py | 25 ++++++++++++++++-- python/pyspark/sql/streaming.pyi | 1 + .../spark/sql/catalyst/csv/CSVOptions.scala | 8 +++++- .../apache/spark/sql/DataFrameReader.scala | 21 +++++++++++++++ .../sql/streaming/DataStreamReader.scala | 21 +++++++++++++++ .../execution/datasources/csv/CSVSuite.scala | 24 +++++++++++++++++ 8 files changed, 122 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index bb31e6a3e09f8..d120daa5a9434 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -522,7 +522,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, - pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None): + pathGlobFilter=None, recursiveFileLookup=None, modifiedBefore=None, modifiedAfter=None, + unescapedQuoteHandling=None): r"""Loads a CSV file and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if @@ -685,6 +686,26 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non modifiedAfter (batch only) : an optional timestamp to only include files with modification times occurring after the specified time. The provided timestamp must be in the following format: YYYY-MM-DDTHH:mm:ss (e.g. 2020-06-01T13:00:00) + unescapedQuoteHandling : str, optional + defines how the CsvParser will handle values with unescaped quotes. If None is + set, it uses the default value, ``STOP_AT_DELIMITER``. + + * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate + the quote character and proceed parsing the value as a quoted value, until a closing + quote is found. + * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value + as an unquoted value. This will make the parser accumulate all characters of the current + parsed value until the delimiter is found. If no delimiter is found in the value, the + parser will continue accumulating characters from the input until a delimiter or line + ending is found. + * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value + as an unquoted value. This will make the parser accumulate all characters until the + delimiter or a line ending is found in the input. + * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, the content parsed + for the given value will be skipped and the value set in nullValue will be produced + instead. + * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException + will be thrown. 
Examples -------- @@ -708,7 +729,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup, - modifiedBefore=modifiedBefore, modifiedAfter=modifiedAfter) + modifiedBefore=modifiedBefore, modifiedAfter=modifiedAfter, + unescapedQuoteHandling=unescapedQuoteHandling) if isinstance(path, str): path = [path] if type(path) == list: diff --git a/python/pyspark/sql/readwriter.pyi b/python/pyspark/sql/readwriter.pyi index 64c5697203a44..c3b9a428f22b3 100644 --- a/python/pyspark/sql/readwriter.pyi +++ b/python/pyspark/sql/readwriter.pyi @@ -113,6 +113,7 @@ class DataFrameReader(OptionUtils): lineSep: Optional[str] = ..., pathGlobFilter: Optional[Union[bool, str]] = ..., recursiveFileLookup: Optional[Union[bool, str]] = ..., + unescapedQuoteHandling: Optional[str] = ..., ) -> DataFrame: ... def orc( self, diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index e7b2fa16d620a..365b5f38694a7 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -761,7 +761,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None, - pathGlobFilter=None, recursiveFileLookup=None): + pathGlobFilter=None, recursiveFileLookup=None, unescapedQuoteHandling=None): r"""Loads a CSV file stream and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if @@ -900,6 +900,26 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non recursiveFileLookup : str or bool, optional recursively scan a directory for files. Using this option disables `partition discovery `_. # noqa + unescapedQuoteHandling : str, optional + defines how the CsvParser will handle values with unescaped quotes. If None is + set, it uses the default value, ``STOP_AT_DELIMITER``. + + * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate + the quote character and proceed parsing the value as a quoted value, until a closing + quote is found. + * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value + as an unquoted value. This will make the parser accumulate all characters of the current + parsed value until the delimiter is found. If no delimiter is found in the value, the + parser will continue accumulating characters from the input until a delimiter or line + ending is found. + * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value + as an unquoted value. This will make the parser accumulate all characters until the + delimiter or a line ending is found in the input. + * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, the content parsed + for the given value will be skipped and the value set in nullValue will be produced + instead. + * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException + will be thrown. .. 
versionadded:: 2.0.0 @@ -926,7 +946,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep, - pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup) + pathGlobFilter=pathGlobFilter, recursiveFileLookup=recursiveFileLookup, + unescapedQuoteHandling=unescapedQuoteHandling) if isinstance(path, str): return self._df(self._jreader.csv(path)) else: diff --git a/python/pyspark/sql/streaming.pyi b/python/pyspark/sql/streaming.pyi index 56ce140b826d5..829610ad3b94b 100644 --- a/python/pyspark/sql/streaming.pyi +++ b/python/pyspark/sql/streaming.pyi @@ -149,6 +149,7 @@ class DataStreamReader(OptionUtils): lineSep: Optional[str] = ..., pathGlobFilter: Optional[Union[bool, str]] = ..., recursiveFileLookup: Optional[Union[bool, str]] = ..., + unescapedQuoteHandling: Optional[str] = ..., ) -> DataFrame: ... class DataStreamWriter: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index f2191fcf35f1a..ec405994eadef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -213,6 +213,12 @@ class CSVOptions( } val lineSeparatorInWrite: Option[String] = lineSeparator + /** + * The handling method to be used when unescaped quotes are found in the input. + */ + val unescapedQuoteHandling: UnescapedQuoteHandling = UnescapedQuoteHandling.valueOf(parameters + .getOrElse("unescapedQuoteHandling", "STOP_AT_DELIMITER").toUpperCase(Locale.ROOT)) + def asWriterSettings: CsvWriterSettings = { val writerSettings = new CsvWriterSettings() val format = writerSettings.getFormat @@ -258,7 +264,7 @@ class CSVOptions( settings.setNullValue(nullValue) settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) - settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) + settings.setUnescapedQuoteHandling(unescapedQuoteHandling) settings.setLineSeparatorDetectionEnabled(lineSeparatorInRead.isEmpty && multiLine) lineSeparatorInRead.foreach { _ => settings.setNormalizeLineEndingsWithinQuotes(!multiLine) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b26bc6441b6cf..8f96f0b882424 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -727,6 +727,27 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * a record can have. *
  • `maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed * for any given value being read. By default, it is -1 meaning unlimited length
  • + *
  • `unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser + * will handle values with unescaped quotes. + *
      + *
    • `STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate + * the quote character and proceed parsing the value as a quoted value, until a closing + * quote is found.
    • + *
    • `BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the value + * as an unquoted value. This will make the parser accumulate all characters of the current + * parsed value until the delimiter is found. If no + * delimiter is found in the value, the parser will continue accumulating characters from + * the input until a delimiter or line ending is found.
    • + *
    • `STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the value + * as an unquoted value. This will make the parser accumulate all characters until the + * delimiter or a line ending is found in the input.
    • + *
    • `SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed + * for the given value will be skipped and the value set in nullValue will be produced + * instead.
    • + *
    • `RAISE_ERROR`: If unescaped quotes are found in the input, a TextParsingException + * will be thrown.
    • + *
    + *
  • *
  • `mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records * during parsing. It supports the following case-insensitive modes. Note that Spark tries * to parse only required columns in CSV under column pruning. Therefore, corrupt records diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 9bc4acd49a980..7f4ef8be562fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -396,6 +396,27 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * a record can have.
  • *
  • `maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed * for any given value being read. By default, it is -1 meaning unlimited length
  • + *
  • `unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser + * will handle values with unescaped quotes. + *
      + *
    • `STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate + * the quote character and proceed parsing the value as a quoted value, until a closing + * quote is found.
    • + *
    • `BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the value + * as an unquoted value. This will make the parser accumulate all characters of the current + * parsed value until the delimiter is found. If no delimiter is found in the value, the + * parser will continue accumulating characters from the input until a delimiter or line + * ending is found.
    • + *
    • `STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the value + * as an unquoted value. This will make the parser accumulate all characters until the + * delimiter or a line ending is found in the input.
    • + *
    • `SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed + * for the given value will be skipped and the value set in nullValue will be produced + * instead.
    • + *
    • `RAISE_ERROR`: If unescaped quotes are found in the input, a TextParsingException + * will be thrown.
    • + *
    + *
  • *
  • `mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records * during parsing. It supports the following case-insensitive modes. *
      diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index a236814fdcdcd..30f0e45d04eab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2428,6 +2428,30 @@ abstract class CSVSuite assert(readback.collect sameElements Array(Row("0"), Row("1"), Row("2"))) } } + + test("SPARK-33566: configure UnescapedQuoteHandling to parse " + + "unescaped quotes and unescaped delimiter data correctly") { + withTempPath { path => + val dataPath = path.getCanonicalPath + val row1 = Row("""a,""b,c""", "xyz") + val row2 = Row("""a,b,c""", """x""yz""") + // Generate the test data, use `,` as delimiter and `"` as quotes, but they didn't escape. + Seq( + """c1,c2""", + s""""${row1.getString(0)}","${row1.getString(1)}"""", + s""""${row2.getString(0)}","${row2.getString(1)}"""") + .toDF().repartition(1).write.text(dataPath) + // Without configure UnescapedQuoteHandling to STOP_AT_CLOSING_QUOTE, + // the result will be Row(""""a,""b""", """c""""), Row("""a,b,c""", """"x""yz"""") + val result = spark.read + .option("inferSchema", "true") + .option("header", "true") + .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE") + .csv(dataPath).collect() + val exceptResults = Array(row1, row2) + assert(result.sameElements(exceptResults)) + } + } } class CSVv1Suite extends CSVSuite { From 8792280a735598589dc6cbced03262be2b6f8f76 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Fri, 27 Nov 2020 07:08:24 +0000 Subject: [PATCH 024/150] [SPARK-33575][SQL] Fix misleading exception for "ANALYZE TABLE ... FOR COLUMNS" on temporary views ### What changes were proposed in this pull request? This PR proposes to fix the exception message for `ANALYZE TABLE ... FOR COLUMNS` on temporary views. The current behavior throws `NoSuchTableException` even if the temporary view exists: ``` sql("CREATE TEMP VIEW t AS SELECT 1 AS id") sql("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS id") org.apache.spark.sql.catalyst.analysis.NoSuchTableException: Table or view 't' not found in database 'db'; at org.apache.spark.sql.execution.command.AnalyzeColumnCommand.analyzeColumnInTempView(AnalyzeColumnCommand.scala:76) at org.apache.spark.sql.execution.command.AnalyzeColumnCommand.run(AnalyzeColumnCommand.scala:54) ``` After this PR, more reasonable exception is thrown: ``` org.apache.spark.sql.AnalysisException: Temporary view `testView` is not cached for analyzing columns.; [info] at org.apache.spark.sql.execution.command.AnalyzeColumnCommand.analyzeColumnInTempView(AnalyzeColumnCommand.scala:74) [info] at org.apache.spark.sql.execution.command.AnalyzeColumnCommand.run(AnalyzeColumnCommand.scala:54) ``` ### Why are the changes needed? To fix a misleading exception. ### Does this PR introduce _any_ user-facing change? Yes, the exception thrown is changed as shown above. ### How was this patch tested? Updated existing test. Closes #30519 from imback82/analyze_table_message. 
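For illustration only (not part of the patch), a minimal sketch of the sequence the updated test exercises, assuming a local SparkSession: a temporary view has to be cached before its column statistics can be analyzed.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

spark.sql("CREATE TEMP VIEW t AS SELECT 1 AS id")
// Without this CACHE step, ANALYZE ... FOR COLUMNS on the temp view now fails with the
// clearer "is not cached for analyzing columns" AnalysisException.
spark.sql("CACHE TABLE t")
spark.sql("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS id")
```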
Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../spark/sql/execution/command/AnalyzeColumnCommand.scala | 5 ++--- .../org/apache/spark/sql/StatisticsCollectionSuite.scala | 5 +++-- .../scala/org/apache/spark/sql/execution/SQLViewSuite.scala | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 5017893077922..3b90f807b3138 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -71,9 +71,8 @@ case class AnalyzeColumnCommand( private def analyzeColumnInTempView(plan: LogicalPlan, sparkSession: SparkSession): Unit = { if (!analyzeColumnInCachedData(plan, sparkSession)) { - val catalog = sparkSession.sessionState.catalog - val db = tableIdent.database.getOrElse(catalog.getCurrentDatabase) - throw new NoSuchTableException(db = db, table = tableIdent.identifier) + throw new AnalysisException( + s"Temporary view $tableIdent is not cached for analyzing columns.") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index 65377594f083c..cd03fadf34b98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -526,7 +526,7 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared val errMsg = intercept[AnalysisException] { sql("ANALYZE TABLE tempView COMPUTE STATISTICS FOR COLUMNS id") }.getMessage - assert(errMsg.contains(s"Table or view 'tempView' not found in database 'default'")) + assert(errMsg.contains("Temporary view `tempView` is not cached for analyzing columns")) // Cache the view then analyze it sql("CACHE TABLE tempView") @@ -548,7 +548,8 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared val errMsg2 = intercept[AnalysisException] { sql(s"ANALYZE TABLE $globalTempDB.gTempView COMPUTE STATISTICS FOR COLUMNS id") }.getMessage - assert(errMsg2.contains(s"Table or view 'gTempView' not found in database '$globalTempDB'")) + assert(errMsg2.contains( + s"Temporary view `$globalTempDB`.`gTempView` is not cached for analyzing columns")) // Cache the view then analyze it sql(s"CACHE TABLE $globalTempDB.gTempView") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index edeebde7db726..5d29503848772 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -188,7 +188,10 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql(s"ANALYZE TABLE $viewName COMPUTE STATISTICS") }.getMessage assert(e5.contains(s"$viewName is a temp view not table or permanent view")) - assertNoSuchTable(s"ANALYZE TABLE $viewName COMPUTE STATISTICS FOR COLUMNS id") + val e6 = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $viewName COMPUTE STATISTICS FOR COLUMNS id") + }.getMessage + assert(e6.contains(s"Temporary view `$viewName` is not cached for analyzing columns.")) } } From 2c41d9d8fa363b62519128819841f39e68429205 Mon Sep 17 
00:00:00 2001 From: Terry Kim Date: Fri, 27 Nov 2020 10:16:56 +0000 Subject: [PATCH 025/150] [SPARK-33522][SQL] Improve exception messages while handling UnresolvedTableOrView ### What changes were proposed in this pull request? This PR proposes to improve the exception messages while `UnresolvedTableOrView` is handled based on this suggestion: https://github.com/apache/spark/pull/30321#discussion_r521127001. Currently, when an identifier is resolved to a temp view when a table/permanent view is expected, the following exception message is displayed (e.g., for `SHOW CREATE TABLE`): ``` t is a temp view not table or permanent view. ``` After this PR, the message will be: ``` t is a temp view. 'SHOW CREATE TABLE' expects a table or permanent view. ``` Also, if an identifier is not resolved, the following exception message is currently used: ``` Table or view not found: t ``` After this PR, the message will be: ``` Table or permanent view not found for 'SHOW CREATE TABLE': t ``` or ``` Table or view not found for 'ANALYZE TABLE ... FOR COLUMNS ...': t ``` ### Why are the changes needed? To improve the exception message. ### Does this PR introduce _any_ user-facing change? Yes, the exception message will be changed as described above. ### How was this patch tested? Updated existing tests. Closes #30475 from imback82/unresolved_table_or_view. Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 9 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 4 +- .../catalyst/analysis/v2ResolutionPlans.scala | 1 + .../sql/catalyst/parser/AstBuilder.scala | 31 +++--- .../sql/catalyst/parser/DDLParserSuite.scala | 96 +++++++++++-------- .../sql-tests/results/describe.sql.out | 2 +- .../sql-tests/results/show_columns.sql.out | 8 +- .../spark/sql/ShowCreateTableSuite.scala | 6 +- .../spark/sql/StatisticsCollectionSuite.scala | 3 +- .../sql/connector/DataSourceV2SQLSuite.scala | 2 +- .../spark/sql/execution/SQLViewSuite.scala | 6 +- .../v2/jdbc/JDBCTableCatalogSuite.scala | 6 +- .../sql/hive/execution/HiveCommandSuite.scala | 2 +- 13 files changed, 104 insertions(+), 72 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 837686420375a..77c1dd9ebb7fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -866,11 +866,12 @@ class Analyzer(override val catalogManager: CatalogManager) u.failAnalysis(s"${ident.quoted} is a temp view. '$cmd' expects a table") } u - case u @ UnresolvedTableOrView(ident, allowTempView) => + case u @ UnresolvedTableOrView(ident, cmd, allowTempView) => lookupTempView(ident) .map { _ => if (!allowTempView) { - u.failAnalysis(s"${ident.quoted} is a temp view not table or permanent view.") + u.failAnalysis( + s"${ident.quoted} is a temp view. 
'$cmd' expects a table or permanent view.") } ResolvedView(ident.asIdentifier, isTemp = true) } @@ -955,7 +956,7 @@ class Analyzer(override val catalogManager: CatalogManager) .map(ResolvedTable(catalog.asTableCatalog, ident, _)) .getOrElse(u) - case u @ UnresolvedTableOrView(NonSessionCatalogAndIdentifier(catalog, ident), _) => + case u @ UnresolvedTableOrView(NonSessionCatalogAndIdentifier(catalog, ident), _, _) => CatalogV2Util.loadTable(catalog, ident) .map(ResolvedTable(catalog.asTableCatalog, ident, _)) .getOrElse(u) @@ -1085,7 +1086,7 @@ class Analyzer(override val catalogManager: CatalogManager) case table => table }.getOrElse(u) - case u @ UnresolvedTableOrView(identifier, _) => + case u @ UnresolvedTableOrView(identifier, _, _) => lookupTableOrView(identifier).getOrElse(u) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 9998035d65c3f..9a3ab4a5f8d11 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -101,7 +101,9 @@ trait CheckAnalysis extends PredicateHelper { u.failAnalysis(s"Table not found for '${u.commandName}': ${u.multipartIdentifier.quoted}") case u: UnresolvedTableOrView => - u.failAnalysis(s"Table or view not found: ${u.multipartIdentifier.quoted}") + val viewStr = if (u.allowTempView) "view" else "permanent view" + u.failAnalysis( + s"Table or $viewStr not found for '${u.commandName}': ${u.multipartIdentifier.quoted}") case u: UnresolvedRelation => u.failAnalysis(s"Table or view not found: ${u.multipartIdentifier.quoted}") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala index 0e883a88f2691..95fc4f47dec7f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala @@ -51,6 +51,7 @@ case class UnresolvedTable( */ case class UnresolvedTableOrView( multipartIdentifier: Seq[String], + commandName: String, allowTempView: Boolean = true) extends LeafNode { override lazy val resolved: Boolean = false override def output: Seq[Attribute] = Nil diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 606d923061441..4cd9b2bea32a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3148,7 +3148,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg override def visitDropTable(ctx: DropTableContext): LogicalPlan = withOrigin(ctx) { // DROP TABLE works with either a table or a temporary view. 
DropTable( - UnresolvedTableOrView(visitMultipartIdentifier(ctx.multipartIdentifier())), + UnresolvedTableOrView(visitMultipartIdentifier(ctx.multipartIdentifier()), "DROP TABLE"), ctx.EXISTS != null, ctx.PURGE != null) } @@ -3453,12 +3453,15 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg */ override def visitDescribeRelation(ctx: DescribeRelationContext): LogicalPlan = withOrigin(ctx) { val isExtended = ctx.EXTENDED != null || ctx.FORMATTED != null + val relation = UnresolvedTableOrView( + visitMultipartIdentifier(ctx.multipartIdentifier()), + "DESCRIBE TABLE") if (ctx.describeColName != null) { if (ctx.partitionSpec != null) { throw new ParseException("DESC TABLE COLUMN for a specific partition is not supported", ctx) } else { DescribeColumn( - UnresolvedTableOrView(visitMultipartIdentifier(ctx.multipartIdentifier())), + relation, ctx.describeColName.nameParts.asScala.map(_.getText).toSeq, isExtended) } @@ -3473,10 +3476,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } else { Map.empty[String, String] } - DescribeRelation( - UnresolvedTableOrView(visitMultipartIdentifier(ctx.multipartIdentifier())), - partitionSpec, - isExtended) + DescribeRelation(relation, partitionSpec, isExtended) } } @@ -3514,7 +3514,10 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg val tableName = visitMultipartIdentifier(ctx.multipartIdentifier()) if (ctx.ALL() != null) { checkPartitionSpec() - AnalyzeColumn(UnresolvedTableOrView(tableName), None, allColumns = true) + AnalyzeColumn( + UnresolvedTableOrView(tableName, "ANALYZE TABLE ... FOR ALL COLUMNS"), + None, + allColumns = true) } else if (ctx.identifierSeq() == null) { val partitionSpec = if (ctx.partitionSpec != null) { visitPartitionSpec(ctx.partitionSpec) @@ -3522,13 +3525,13 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg Map.empty[String, Option[String]] } AnalyzeTable( - UnresolvedTableOrView(tableName, allowTempView = false), + UnresolvedTableOrView(tableName, "ANALYZE TABLE", allowTempView = false), partitionSpec, noScan = ctx.identifier != null) } else { checkPartitionSpec() AnalyzeColumn( - UnresolvedTableOrView(tableName), + UnresolvedTableOrView(tableName, "ANALYZE TABLE ... 
FOR COLUMNS ..."), Option(visitIdentifierSeq(ctx.identifierSeq())), allColumns = false) } @@ -3572,6 +3575,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg ShowCreateTable( UnresolvedTableOrView( visitMultipartIdentifier(ctx.multipartIdentifier()), + "SHOW CREATE TABLE", allowTempView = false), ctx.SERDE != null) } @@ -3647,7 +3651,10 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * }}} */ override def visitRefreshTable(ctx: RefreshTableContext): LogicalPlan = withOrigin(ctx) { - RefreshTable(UnresolvedTableOrView(visitMultipartIdentifier(ctx.multipartIdentifier()))) + RefreshTable( + UnresolvedTableOrView( + visitMultipartIdentifier(ctx.multipartIdentifier()), + "REFRESH TABLE")) } /** @@ -3670,7 +3677,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } else { nameParts } - ShowColumns(UnresolvedTableOrView(tableName), namespace) + ShowColumns(UnresolvedTableOrView(tableName, "SHOW COLUMNS"), namespace) } /** @@ -3881,7 +3888,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg override def visitShowTblProperties( ctx: ShowTblPropertiesContext): LogicalPlan = withOrigin(ctx) { ShowTableProperties( - UnresolvedTableOrView(visitMultipartIdentifier(ctx.table)), + UnresolvedTableOrView(visitMultipartIdentifier(ctx.table), "SHOW TBLPROPERTIES"), Option(ctx.key).map(visitTablePropertyKey)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index f650922e75f6e..c58ff81f17131 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -697,27 +697,27 @@ class DDLParserSuite extends AnalysisTest { test("drop table") { parseCompare("DROP TABLE testcat.ns1.ns2.tbl", DropTable( - UnresolvedTableOrView(Seq("testcat", "ns1", "ns2", "tbl")), + UnresolvedTableOrView(Seq("testcat", "ns1", "ns2", "tbl"), "DROP TABLE"), ifExists = false, purge = false)) parseCompare(s"DROP TABLE db.tab", DropTable( - UnresolvedTableOrView(Seq("db", "tab")), ifExists = false, purge = false)) + UnresolvedTableOrView(Seq("db", "tab"), "DROP TABLE"), ifExists = false, purge = false)) parseCompare(s"DROP TABLE IF EXISTS db.tab", DropTable( - UnresolvedTableOrView(Seq("db", "tab")), ifExists = true, purge = false)) + UnresolvedTableOrView(Seq("db", "tab"), "DROP TABLE"), ifExists = true, purge = false)) parseCompare(s"DROP TABLE tab", DropTable( - UnresolvedTableOrView(Seq("tab")), ifExists = false, purge = false)) + UnresolvedTableOrView(Seq("tab"), "DROP TABLE"), ifExists = false, purge = false)) parseCompare(s"DROP TABLE IF EXISTS tab", DropTable( - UnresolvedTableOrView(Seq("tab")), ifExists = true, purge = false)) + UnresolvedTableOrView(Seq("tab"), "DROP TABLE"), ifExists = true, purge = false)) parseCompare(s"DROP TABLE tab PURGE", DropTable( - UnresolvedTableOrView(Seq("tab")), ifExists = false, purge = true)) + UnresolvedTableOrView(Seq("tab"), "DROP TABLE"), ifExists = false, purge = true)) parseCompare(s"DROP TABLE IF EXISTS tab PURGE", DropTable( - UnresolvedTableOrView(Seq("tab")), ifExists = true, purge = true)) + UnresolvedTableOrView(Seq("tab"), "DROP TABLE"), ifExists = true, purge = true)) } test("drop view") { @@ -1112,26 +1112,26 @@ class DDLParserSuite extends AnalysisTest { test("describe table 
column") { comparePlans(parsePlan("DESCRIBE t col"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("col"), isExtended = false)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("col"), isExtended = false)) comparePlans(parsePlan("DESCRIBE t `abc.xyz`"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("abc.xyz"), isExtended = false)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("abc.xyz"), isExtended = false)) comparePlans(parsePlan("DESCRIBE t abc.xyz"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("abc", "xyz"), isExtended = false)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("abc", "xyz"), isExtended = false)) comparePlans(parsePlan("DESCRIBE t `a.b`.`x.y`"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("a.b", "x.y"), isExtended = false)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("a.b", "x.y"), isExtended = false)) comparePlans(parsePlan("DESCRIBE TABLE t col"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("col"), isExtended = false)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("col"), isExtended = false)) comparePlans(parsePlan("DESCRIBE TABLE EXTENDED t col"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("col"), isExtended = true)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("col"), isExtended = true)) comparePlans(parsePlan("DESCRIBE TABLE FORMATTED t col"), DescribeColumn( - UnresolvedTableOrView(Seq("t")), Seq("col"), isExtended = true)) + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Seq("col"), isExtended = true)) val caught = intercept[AnalysisException]( parsePlan("DESCRIBE TABLE t PARTITION (ds='1970-01-01') col")) @@ -1150,13 +1150,17 @@ class DDLParserSuite extends AnalysisTest { test("SPARK-17328 Fix NPE with EXPLAIN DESCRIBE TABLE") { comparePlans(parsePlan("describe t"), - DescribeRelation(UnresolvedTableOrView(Seq("t")), Map.empty, isExtended = false)) + DescribeRelation( + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Map.empty, isExtended = false)) comparePlans(parsePlan("describe table t"), - DescribeRelation(UnresolvedTableOrView(Seq("t")), Map.empty, isExtended = false)) + DescribeRelation( + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Map.empty, isExtended = false)) comparePlans(parsePlan("describe table extended t"), - DescribeRelation(UnresolvedTableOrView(Seq("t")), Map.empty, isExtended = true)) + DescribeRelation( + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Map.empty, isExtended = true)) comparePlans(parsePlan("describe table formatted t"), - DescribeRelation(UnresolvedTableOrView(Seq("t")), Map.empty, isExtended = true)) + DescribeRelation( + UnresolvedTableOrView(Seq("t"), "DESCRIBE TABLE"), Map.empty, isExtended = true)) } test("insert table: basic append") { @@ -1769,57 +1773,57 @@ class DDLParserSuite extends AnalysisTest { test("analyze table statistics") { comparePlans(parsePlan("analyze table a.b.c compute statistics"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map.empty, noScan = false)) comparePlans(parsePlan("analyze table a.b.c compute statistics noscan"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map.empty, noScan = true)) comparePlans(parsePlan("analyze table a.b.c partition (a) compute statistics nOscAn"), AnalyzeTable( - 
UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("a" -> None), noScan = true)) // Partitions specified comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> Some("2008-04-09"), "hr" -> Some("11")), noScan = false)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> Some("2008-04-09"), "hr" -> Some("11")), noScan = true)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds='2008-04-09') COMPUTE STATISTICS noscan"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> Some("2008-04-09")), noScan = true)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> Some("2008-04-09"), "hr" -> None), noScan = false)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS noscan"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> Some("2008-04-09"), "hr" -> None), noScan = true)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds, hr=11) COMPUTE STATISTICS noscan"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> None, "hr" -> Some("11")), noScan = true)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds, hr) COMPUTE STATISTICS"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> None, "hr" -> None), noScan = false)) comparePlans( parsePlan("ANALYZE TABLE a.b.c PARTITION(ds, hr) COMPUTE STATISTICS noscan"), AnalyzeTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE", allowTempView = false), Map("ds" -> None, "hr" -> None), noScan = true)) intercept("analyze table a.b.c compute statistics xxxx", @@ -1834,7 +1838,9 @@ class DDLParserSuite extends AnalysisTest { comparePlans( parsePlan("ANALYZE TABLE a.b.c COMPUTE STATISTICS FOR COLUMNS key, value"), AnalyzeColumn( - UnresolvedTableOrView(Seq("a", "b", "c")), Option(Seq("key", "value")), allColumns = false)) + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE ... 
FOR COLUMNS ..."), + Option(Seq("key", "value")), + allColumns = false)) // Partition specified - should be ignored comparePlans( @@ -1844,7 +1850,9 @@ class DDLParserSuite extends AnalysisTest { |COMPUTE STATISTICS FOR COLUMNS key, value """.stripMargin), AnalyzeColumn( - UnresolvedTableOrView(Seq("a", "b", "c")), Option(Seq("key", "value")), allColumns = false)) + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE ... FOR COLUMNS ..."), + Option(Seq("key", "value")), + allColumns = false)) // Partition specified should be ignored in case of COMPUTE STATISTICS FOR ALL COLUMNS comparePlans( @@ -1854,7 +1862,9 @@ class DDLParserSuite extends AnalysisTest { |COMPUTE STATISTICS FOR ALL COLUMNS """.stripMargin), AnalyzeColumn( - UnresolvedTableOrView(Seq("a", "b", "c")), None, allColumns = true)) + UnresolvedTableOrView(Seq("a", "b", "c"), "ANALYZE TABLE ... FOR ALL COLUMNS"), + None, + allColumns = true)) intercept("ANALYZE TABLE a.b.c COMPUTE STATISTICS FOR ALL COLUMNS key, value", "mismatched input 'key' expecting {, ';'}") @@ -1898,12 +1908,13 @@ class DDLParserSuite extends AnalysisTest { test("SHOW CREATE table") { comparePlans( parsePlan("SHOW CREATE TABLE a.b.c"), - ShowCreateTable(UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false))) + ShowCreateTable( + UnresolvedTableOrView(Seq("a", "b", "c"), "SHOW CREATE TABLE", allowTempView = false))) comparePlans( parsePlan("SHOW CREATE TABLE a.b.c AS SERDE"), ShowCreateTable( - UnresolvedTableOrView(Seq("a", "b", "c"), allowTempView = false), + UnresolvedTableOrView(Seq("a", "b", "c"), "SHOW CREATE TABLE", allowTempView = false), asSerde = true)) } @@ -1949,7 +1960,7 @@ class DDLParserSuite extends AnalysisTest { test("REFRESH TABLE") { comparePlans( parsePlan("REFRESH TABLE a.b.c"), - RefreshTable(UnresolvedTableOrView(Seq("a", "b", "c")))) + RefreshTable(UnresolvedTableOrView(Seq("a", "b", "c"), "REFRESH TABLE"))) } test("show columns") { @@ -1959,13 +1970,15 @@ class DDLParserSuite extends AnalysisTest { val sql4 = "SHOW COLUMNS FROM db1.t1 IN db1" val parsed1 = parsePlan(sql1) - val expected1 = ShowColumns(UnresolvedTableOrView(Seq("t1")), None) + val expected1 = ShowColumns(UnresolvedTableOrView(Seq("t1"), "SHOW COLUMNS"), None) val parsed2 = parsePlan(sql2) - val expected2 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1")), None) + val expected2 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1"), "SHOW COLUMNS"), None) val parsed3 = parsePlan(sql3) - val expected3 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1")), Some(Seq("db1"))) + val expected3 = + ShowColumns(UnresolvedTableOrView(Seq("db1", "t1"), "SHOW COLUMNS"), Some(Seq("db1"))) val parsed4 = parsePlan(sql4) - val expected4 = ShowColumns(UnresolvedTableOrView(Seq("db1", "t1")), Some(Seq("db1"))) + val expected4 = + ShowColumns(UnresolvedTableOrView(Seq("db1", "t1"), "SHOW COLUMNS"), Some(Seq("db1"))) comparePlans(parsed1, expected1) comparePlans(parsed2, expected2) @@ -2300,11 +2313,12 @@ class DDLParserSuite extends AnalysisTest { test("SHOW TBLPROPERTIES table") { comparePlans( parsePlan("SHOW TBLPROPERTIES a.b.c"), - ShowTableProperties(UnresolvedTableOrView(Seq("a", "b", "c")), None)) + ShowTableProperties(UnresolvedTableOrView(Seq("a", "b", "c"), "SHOW TBLPROPERTIES"), None)) comparePlans( parsePlan("SHOW TBLPROPERTIES a.b.c('propKey1')"), - ShowTableProperties(UnresolvedTableOrView(Seq("a", "b", "c")), Some("propKey1"))) + ShowTableProperties( + UnresolvedTableOrView(Seq("a", "b", "c"), "SHOW TBLPROPERTIES"), Some("propKey1"))) } test("DESCRIBE 
FUNCTION") { diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 07aed98d120f9..145c987ee5f61 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -540,7 +540,7 @@ struct -- !query output == Parsed Logical Plan == 'DescribeRelation false -+- 'UnresolvedTableOrView [t], true ++- 'UnresolvedTableOrView [t], DESCRIBE TABLE, true == Analyzed Logical Plan == col_name: string, data_type: string, comment: string diff --git a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out index 6ddffb89987d8..03df876133aa4 100644 --- a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out @@ -94,7 +94,7 @@ SHOW COLUMNS IN badtable FROM showdb struct<> -- !query output org.apache.spark.sql.AnalysisException -Table or view not found: showdb.badtable; line 1 pos 0 +Table or view not found for 'SHOW COLUMNS': showdb.badtable; line 1 pos 0 -- !query @@ -130,7 +130,7 @@ SHOW COLUMNS IN showdb.showcolumn3 struct<> -- !query output org.apache.spark.sql.AnalysisException -Table or view not found: showdb.showcolumn3; line 1 pos 0 +Table or view not found for 'SHOW COLUMNS': showdb.showcolumn3; line 1 pos 0 -- !query @@ -139,7 +139,7 @@ SHOW COLUMNS IN showcolumn3 FROM showdb struct<> -- !query output org.apache.spark.sql.AnalysisException -Table or view not found: showdb.showcolumn3; line 1 pos 0 +Table or view not found for 'SHOW COLUMNS': showdb.showcolumn3; line 1 pos 0 -- !query @@ -148,7 +148,7 @@ SHOW COLUMNS IN showcolumn4 struct<> -- !query output org.apache.spark.sql.AnalysisException -Table or view not found: showcolumn4; line 1 pos 0 +Table or view not found for 'SHOW COLUMNS': showcolumn4; line 1 pos 0 -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala index 7b4c8d1cc71d8..92d306c0e3c11 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala @@ -155,7 +155,8 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { val ex = intercept[AnalysisException] { sql(s"SHOW CREATE TABLE $viewName") } - assert(ex.getMessage.contains(s"$viewName is a temp view not table or permanent view")) + assert(ex.getMessage.contains( + s"$viewName is a temp view. 'SHOW CREATE TABLE' expects a table or permanent view.")) } withGlobalTempView(viewName) { @@ -165,7 +166,8 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { sql(s"SHOW CREATE TABLE $globalTempViewDb.$viewName") } assert(ex.getMessage.contains( - s"$globalTempViewDb.$viewName is a temp view not table or permanent view")) + s"$globalTempViewDb.$viewName is a temp view. 
" + + "'SHOW CREATE TABLE' expects a table or permanent view.")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index cd03fadf34b98..3fc679f6b9fc7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -542,7 +542,8 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared val errMsg1 = intercept[AnalysisException] { sql(s"ANALYZE TABLE $globalTempDB.gTempView COMPUTE STATISTICS FOR COLUMNS id") }.getMessage - assert(errMsg1.contains(s"Table or view not found: $globalTempDB.gTempView")) + assert(errMsg1.contains("Table or view not found for 'ANALYZE TABLE ... FOR COLUMNS ...': " + + s"$globalTempDB.gTempView")) // Analyzes in a global temporary view sql("CREATE GLOBAL TEMP VIEW gTempView AS SELECT * FROM range(1, 30)") val errMsg2 = intercept[AnalysisException] { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index f2b57f9442d09..98580568a8df6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -729,7 +729,7 @@ class DataSourceV2SQLSuite val ex = intercept[AnalysisException] { sql("DROP TABLE testcat.db.notbl") } - assert(ex.getMessage.contains("Table or view not found: testcat.db.notbl")) + assert(ex.getMessage.contains("Table or view not found for 'DROP TABLE': testcat.db.notbl")) sql("DROP TABLE IF EXISTS testcat.db.notbl") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 5d29503848772..d776198bc7470 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -183,11 +183,13 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { val e4 = intercept[AnalysisException] { sql(s"SHOW CREATE TABLE $viewName") }.getMessage - assert(e4.contains(s"$viewName is a temp view not table or permanent view")) + assert(e4.contains( + s"$viewName is a temp view. 'SHOW CREATE TABLE' expects a table or permanent view.")) val e5 = intercept[AnalysisException] { sql(s"ANALYZE TABLE $viewName COMPUTE STATISTICS") }.getMessage - assert(e5.contains(s"$viewName is a temp view not table or permanent view")) + assert(e5.contains( + s"$viewName is a temp view. 
'ANALYZE TABLE' expects a table or permanent view.")) val e6 = intercept[AnalysisException] { sql(s"ANALYZE TABLE $viewName COMPUTE STATISTICS FOR COLUMNS id") }.getMessage diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala index c7ad96c8f7619..97dd92acc7805 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala @@ -80,8 +80,10 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { sql("DROP TABLE h2.test.to_drop") checkAnswer(sql("SHOW TABLES IN h2.test"), Seq(Row("test", "people"))) Seq( - "h2.test.not_existing_table" -> "Table or view not found: h2.test.not_existing_table", - "h2.bad_test.not_existing_table" -> "Table or view not found: h2.bad_test.not_existing_table" + "h2.test.not_existing_table" -> + "Table or view not found for 'DROP TABLE': h2.test.not_existing_table", + "h2.bad_test.not_existing_table" -> + "Table or view not found for 'DROP TABLE': h2.bad_test.not_existing_table" ).foreach { case (table, expectedMsg) => val msg = intercept[AnalysisException] { sql(s"DROP TABLE $table") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index d3398842afb21..4feb970ea6f1a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -137,7 +137,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val message = intercept[AnalysisException] { sql("SHOW TBLPROPERTIES badtable") }.getMessage - assert(message.contains("Table or view not found: badtable")) + assert(message.contains("Table or view not found for 'SHOW TBLPROPERTIES': badtable")) // When key is not found, a row containing the error is returned. checkAnswer( From e43255051c0a82713d653fe590fe7728e43556ce Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Fri, 27 Nov 2020 10:27:08 +0000 Subject: [PATCH 026/150] [SPARK-28645][SQL] ParseException is thrown when the window is redefined ### What changes were proposed in this pull request? Currently in Spark one could redefine a window. For instance: `select count(*) OVER w FROM tenk1 WINDOW w AS (ORDER BY unique1), w AS (ORDER BY unique1);` The window `w` is defined two times. In PgSQL, on the other hand, a thrown will happen: `ERROR: window "w" is already defined` ### Why are the changes needed? The current implement gives the following window definitions a higher priority. But it wasn't Spark's intention and users can't know from any document of Spark. This PR fixes the bug. ### Does this PR introduce _any_ user-facing change? Yes. There is an example query output with/without this fix. 
``` SELECT employee_name, salary, first_value(employee_name) OVER w highest_salary, nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING), w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING) ORDER BY salary DESC ``` The output before this fix: ``` Larry Bott 11798 Larry Bott Gerard Bondur Gerard Bondur 11472 Larry Bott Gerard Bondur Pamela Castillo 11303 Larry Bott Gerard Bondur Barry Jones 10586 Larry Bott Gerard Bondur George Vanauf 10563 Larry Bott Gerard Bondur Loui Bondur 10449 Larry Bott Gerard Bondur Mary Patterson 9998 Larry Bott Gerard Bondur Steve Patterson 9441 Larry Bott Gerard Bondur Julie Firrelli 9181 Larry Bott Gerard Bondur Jeff Firrelli 8992 Larry Bott Gerard Bondur William Patterson 8870 Larry Bott Gerard Bondur Diane Murphy 8435 Larry Bott Gerard Bondur Leslie Jennings 8113 Larry Bott Gerard Bondur Gerard Hernandez 6949 Larry Bott Gerard Bondur Foon Yue Tseng 6660 Larry Bott Gerard Bondur Anthony Bow 6627 Larry Bott Gerard Bondur Leslie Thompson 5186 Larry Bott Gerard Bondur ``` The output after this fix: ``` struct<> -- !query output org.apache.spark.sql.catalyst.parser.ParseException The definition of window 'w' is repetitive(line 8, pos 0) ``` ### How was this patch tested? Jenkins test. Closes #30512 from beliefer/SPARK-28645. Lead-authored-by: gengjiaan Co-authored-by: beliefer Signed-off-by: Wenchen Fan --- .../sql/catalyst/parser/AstBuilder.scala | 10 ++++- .../resources/sql-tests/inputs/window.sql | 14 ++++++- .../sql-tests/results/window.sql.out | 38 ++++++++++++++++++- 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 4cd9b2bea32a4..afef88f7e97e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -815,10 +815,16 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg ctx: WindowClauseContext, query: LogicalPlan): LogicalPlan = withOrigin(ctx) { // Collect all window specifications defined in the WINDOW clause. 
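    // A WINDOW clause may define several named windows; previously duplicates were folded into
    // a Map, so the last definition silently won. The grouping check below rejects repeated
    // names with a ParseException instead.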
- val baseWindowMap = ctx.namedWindow.asScala.map { + val baseWindowTuples = ctx.namedWindow.asScala.map { wCtx => (wCtx.name.getText, typedVisit[WindowSpec](wCtx.windowSpec)) - }.toMap + } + baseWindowTuples.groupBy(_._1).foreach { kv => + if (kv._2.size > 1) { + throw new ParseException(s"The definition of window '${kv._1}' is repetitive", ctx) + } + } + val baseWindowMap = baseWindowTuples.toMap // Handle cases like // window w1 as (partition by p_mfgr order by p_name diff --git a/sql/core/src/test/resources/sql-tests/inputs/window.sql b/sql/core/src/test/resources/sql-tests/inputs/window.sql index f5223af9125f6..f0336d764bdea 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/window.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/window.sql @@ -250,4 +250,16 @@ WINDOW w AS ( ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) -ORDER BY department; \ No newline at end of file +ORDER BY department; + +SELECT + employee_name, + salary, + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary +FROM + basic_pays +WINDOW + w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING), + w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING) +ORDER BY salary DESC; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out index 1304dcf21d0b3..df2ad96649186 100644 --- a/sql/core/src/test/resources/sql-tests/results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 35 +-- Number of queries: 36 -- !query @@ -739,4 +739,38 @@ Gerard Hernandez SCM 6949 Larry Bott Pamela Castillo George Vanauf Sales 10563 George Vanauf Steve Patterson Steve Patterson Sales 9441 George Vanauf Steve Patterson Julie Firrelli Sales 9181 George Vanauf Steve Patterson -Foon Yue Tseng Sales 6660 George Vanauf Steve Patterson \ No newline at end of file +Foon Yue Tseng Sales 6660 George Vanauf Steve Patterson + + +-- !query +SELECT + employee_name, + salary, + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary +FROM + basic_pays +WINDOW + w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING), + w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING) +ORDER BY salary DESC +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +The definition of window 'w' is repetitive(line 8, pos 0) + +== SQL == +SELECT + employee_name, + salary, + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary +FROM + basic_pays +WINDOW +^^^ + w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING), + w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING) +ORDER BY salary DESC \ No newline at end of file From b9f2f78de59758d1932c1573338539e485a01112 Mon Sep 17 00:00:00 2001 From: "xuewei.linxuewei" Date: Fri, 27 Nov 2020 13:24:11 +0000 Subject: [PATCH 027/150] [SPARK-33498][SQL] Datetime parsing should fail if the input string can't be parsed, or the pattern string is invalid ### What changes were proposed in this pull request? Datetime parsing should fail if the input string can't be parsed, or the pattern string is invalid, when ANSI mode is enable. 
This patch should update GetTimeStamp, UnixTimeStamp, ToUnixTimeStamp and Cast. ### Why are the changes needed? For ANSI mode. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Added UT and Existing UT. Closes #30442 from leanken/leanken-SPARK-33498. Authored-by: xuewei.linxuewei Signed-off-by: Wenchen Fan --- docs/sql-ref-ansi-compliance.md | 5 + .../spark/sql/catalyst/expressions/Cast.scala | 33 +++-- .../expressions/datetimeExpressions.scala | 51 +++++--- .../sql/catalyst/util/DateTimeUtils.scala | 9 ++ .../sql/catalyst/expressions/CastSuite.scala | 41 ++++-- .../expressions/DateExpressionsSuite.scala | 59 ++++++++- .../resources/sql-tests/inputs/datetime.sql | 11 ++ .../sql-tests/results/ansi/datetime.sql.out | 123 +++++++++++++++--- .../sql-tests/results/datetime-legacy.sql.out | 74 ++++++++++- .../sql-tests/results/datetime.sql.out | 74 ++++++++++- .../results/postgreSQL/window_part3.sql.out | 3 +- 11 files changed, 424 insertions(+), 59 deletions(-) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 870ed0aa0daaa..4e19799ca75b9 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -136,12 +136,17 @@ The behavior of some SQL functions can be different under ANSI mode (`spark.sql. - `element_at`: This function throws `NoSuchElementException` if key does not exist in map. - `elt`: This function throws `ArrayIndexOutOfBoundsException` if using invalid indices. - `parse_url`: This function throws `IllegalArgumentException` if an input string is not a valid url. + - `to_date` This function should fail with an exception if the input string can't be parsed, or the pattern string is invalid. + - `to_timestamp` This function should fail with an exception if the input string can't be parsed, or the pattern string is invalid. + - `unix_timestamp` This function should fail with an exception if the input string can't be parsed, or the pattern string is invalid. + - `to_unix_timestamp` This function should fail with an exception if the input string can't be parsed, or the pattern string is invalid. ### SQL Operators The behavior of some SQL operators can be different under ANSI mode (`spark.sql.ansi.enabled=true`). - `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices. - `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map. + - `CAST(string_col AS TIMESTAMP)`: This operator should fail with an exception if the input string can't be parsed. 
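As a rough illustration of the `CAST` behavior above, a minimal spark-shell style sketch (the `spark` session handle and the sample string are illustrative):

```scala
// With ANSI mode on, an unparseable string makes the cast fail at query time.
spark.conf.set("spark.sql.ansi.enabled", true)
spark.sql("SELECT CAST('Unparseable' AS TIMESTAMP)").show()
// java.time.DateTimeException: Cannot cast Unparseable to TimestampType.

// With the default (non-ANSI) behavior the same cast returns NULL.
spark.conf.set("spark.sql.ansi.enabled", false)
spark.sql("SELECT CAST('Unparseable' AS TIMESTAMP)").show()
```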
### SQL Keywords diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index e6f585cacc6c7..95f09d64c484b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -448,7 +448,13 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit // TimestampConverter private[this] def castToTimestamp(from: DataType): Any => Any = from match { case StringType => - buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs, zoneId).orNull) + buildCast[UTF8String](_, utfs => { + if (ansiEnabled) { + DateTimeUtils.stringToTimestampAnsi(utfs, zoneId) + } else { + DateTimeUtils.stringToTimestamp(utfs, zoneId).orNull + } + }) case BooleanType => buildCast[Boolean](_, b => if (b) 1L else 0) case LongType => @@ -1250,15 +1256,22 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit zoneIdClass) val longOpt = ctx.freshVariable("longOpt", classOf[Option[Long]]) (c, evPrim, evNull) => - code""" - scala.Option $longOpt = - org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c, $zid); - if ($longOpt.isDefined()) { - $evPrim = ((Long) $longOpt.get()).longValue(); - } else { - $evNull = true; - } - """ + if (ansiEnabled) { + code""" + $evPrim = + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestampAnsi($c, $zid); + """ + } else { + code""" + scala.Option $longOpt = + org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c, $zid); + if ($longOpt.isDefined()) { + $evPrim = ((Long) $longOpt.get()).longValue(); + } else { + $evNull = true; + } + """ + } case BooleanType => (c, evPrim, evNull) => code"$evPrim = $c ? 
1L : 0L;" case _: IntegralType => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 9953b780ceace..1ff5833fb4dd6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -720,10 +720,12 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti case class ToUnixTimestamp( timeExp: Expression, format: Expression, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends UnixTime { - def this(timeExp: Expression, format: Expression) = this(timeExp, format, None) + def this(timeExp: Expression, format: Expression) = + this(timeExp, format, None, SQLConf.get.ansiEnabled) override def left: Expression = timeExp override def right: Expression = format @@ -767,10 +769,15 @@ case class ToUnixTimestamp( group = "datetime_funcs", since = "1.5.0") // scalastyle:on line.size.limit -case class UnixTimestamp(timeExp: Expression, format: Expression, timeZoneId: Option[String] = None) +case class UnixTimestamp( + timeExp: Expression, + format: Expression, + timeZoneId: Option[String] = None, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends UnixTime { - def this(timeExp: Expression, format: Expression) = this(timeExp, format, None) + def this(timeExp: Expression, format: Expression) = + this(timeExp, format, None, SQLConf.get.ansiEnabled) override def left: Expression = timeExp override def right: Expression = format @@ -792,6 +799,8 @@ case class UnixTimestamp(timeExp: Expression, format: Expression, timeZoneId: Op abstract class ToTimestamp extends BinaryExpression with TimestampFormatterHelper with ExpectsInputTypes { + def failOnError: Boolean + // The result of the conversion to timestamp is microseconds divided by this factor. // For example if the factor is 1000000, the result of the expression is in seconds. 
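  // (GetTimestamp keeps microseconds with a factor of 1, while the unix_timestamp family
  // divides by MICROS_PER_SECOND so its result is in seconds.)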
protected def downScaleFactor: Long @@ -803,7 +812,14 @@ abstract class ToTimestamp Seq(TypeCollection(StringType, DateType, TimestampType), StringType) override def dataType: DataType = LongType - override def nullable: Boolean = true + override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true + + private def isParseError(e: Throwable): Boolean = e match { + case _: DateTimeParseException | + _: DateTimeException | + _: ParseException => true + case _ => false + } override def eval(input: InternalRow): Any = { val t = left.eval(input) @@ -824,9 +840,12 @@ abstract class ToTimestamp try { formatter.parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor } catch { - case _: DateTimeParseException | - _: DateTimeException | - _: ParseException => null + case e if isParseError(e) => + if (failOnError) { + throw e + } else { + null + } } } } @@ -835,6 +854,7 @@ abstract class ToTimestamp override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = CodeGenerator.javaType(dataType) + val parseErrorBranch = if (failOnError) "throw e;" else s"${ev.isNull} = true;" left.dataType match { case StringType => formatterOption.map { fmt => val df = classOf[TimestampFormatter].getName @@ -844,11 +864,11 @@ abstract class ToTimestamp |try { | ${ev.value} = $formatterName.parse($datetimeStr.toString()) / $downScaleFactor; |} catch (java.time.DateTimeException e) { - | ${ev.isNull} = true; + | $parseErrorBranch |} catch (java.time.format.DateTimeParseException e) { - | ${ev.isNull} = true; + | $parseErrorBranch |} catch (java.text.ParseException e) { - | ${ev.isNull} = true; + | $parseErrorBranch |} |""".stripMargin) }.getOrElse { @@ -866,11 +886,11 @@ abstract class ToTimestamp |try { | ${ev.value} = $timestampFormatter.parse($string.toString()) / $downScaleFactor; |} catch (java.time.format.DateTimeParseException e) { - | ${ev.isNull} = true; + | $parseErrorBranch |} catch (java.time.DateTimeException e) { - | ${ev.isNull} = true; + | $parseErrorBranch |} catch (java.text.ParseException e) { - | ${ev.isNull} = true; + | $parseErrorBranch |} |""".stripMargin) } @@ -1737,7 +1757,8 @@ case class DateDiff(endDate: Expression, startDate: Expression) private case class GetTimestamp( left: Expression, right: Expression, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends ToTimestamp { override val downScaleFactor = 1 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 3b974759bd6c0..87cf3c93ba26e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -364,6 +364,15 @@ object DateTimeUtils { } } + def stringToTimestampAnsi(s: UTF8String, timeZoneId: ZoneId): Long = { + val timestamp = stringToTimestamp(s, timeZoneId) + if (timestamp.isEmpty) { + throw new DateTimeException(s"Cannot cast $s to TimestampType.") + } else { + timestamp.get + } + } + /** * Gets the number of microseconds since the epoch of 1970-01-01 00:00:00Z from the given * instance of `java.time.Instant`. 
The epoch microsecond count is a simple incrementing count of diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index f1fc921e401ba..0900a303b4cbe 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.{Date, Timestamp} +import java.time.DateTimeException import java.util.{Calendar, TimeZone} import scala.collection.parallel.immutable.ParVector @@ -106,8 +107,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(cast(Literal(str), TimestampType, Option(zid.getId)), expected) } - checkCastStringToTimestamp("123", null) - val tz = TimeZone.getTimeZone(zid) var c = Calendar.getInstance(tz) c.set(2015, 0, 1, 0, 0, 0) @@ -184,15 +183,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { c.set(2015, 2, 18, 12, 3, 17) c.set(Calendar.MILLISECOND, 123) checkCastStringToTimestamp("2015-03-18T12:03:17.123+7:3", new Timestamp(c.getTimeInMillis)) - - checkCastStringToTimestamp("2015-03-18 123142", null) - checkCastStringToTimestamp("2015-03-18T123123", null) - checkCastStringToTimestamp("2015-03-18X", null) - checkCastStringToTimestamp("2015/03/18", null) - checkCastStringToTimestamp("2015.03.18", null) - checkCastStringToTimestamp("20150318", null) - checkCastStringToTimestamp("2015-031-8", null) - checkCastStringToTimestamp("2015-03-18T12:03:17-0:70", null) } } @@ -302,7 +292,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { } checkEvaluation(cast("abdef", StringType), "abdef") - checkEvaluation(cast("abdef", TimestampType, UTC_OPT), null) checkEvaluation(cast("12.65", DecimalType.SYSTEM_DEFAULT), Decimal(12.65)) checkEvaluation(cast(cast(sd, DateType), StringType), sd) @@ -962,6 +951,34 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { cast("abcd", DecimalType(38, 1)), "invalid input syntax for type numeric") } + + test("ANSI mode: cast string to timestamp with parse error") { + val activeConf = conf + new ParVector(ALL_TIMEZONES.toVector).foreach { zid => + def checkCastWithParseError(str: String): Unit = { + checkExceptionInExpression[DateTimeException]( + cast(Literal(str), TimestampType, Option(zid.getId)), + s"Cannot cast $str to TimestampType.") + } + + SQLConf.withExistingConf(activeConf) { + checkCastWithParseError("123") + checkCastWithParseError("2015-03-18 123142") + checkCastWithParseError("2015-03-18T123123") + checkCastWithParseError("2015-03-18X") + checkCastWithParseError("2015/03/18") + checkCastWithParseError("2015.03.18") + checkCastWithParseError("20150318") + checkCastWithParseError("2015-031-8") + checkCastWithParseError("2015-03-18T12:03:17-0:70") + + val input = "abdef" + checkExceptionInExpression[DateTimeException]( + cast(input, TimestampType, Option(zid.getId)), + s"Cannot cast $input to TimestampType.") + } + } + } } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 85492084d51ac..a3ffc1129fd5e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -18,8 +18,9 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.{Date, Timestamp} -import java.text.SimpleDateFormat +import java.text.{ParseException, SimpleDateFormat} import java.time.{Instant, LocalDate, ZoneId} +import java.time.format.DateTimeParseException import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit._ @@ -1286,4 +1287,58 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { testIntegralFunc(Long.MaxValue) testIntegralFunc(Long.MinValue) } -} + + test("SPARK-33498: GetTimestamp,UnixTimestamp,ToUnixTimestamp with parseError") { + Seq(true, false).foreach { ansiEnabled => + Seq("LEGACY", "CORRECTED", "EXCEPTION").foreach { policy => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy, + SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) { + + val exprSeq = Seq[Expression]( + GetTimestamp(Literal("2020-01-27T20:06:11.847"), Literal("yyyy-MM-dd HH:mm:ss.SSS")), + GetTimestamp(Literal("Unparseable"), Literal("yyyy-MM-dd HH:mm:ss.SSS")), + UnixTimestamp(Literal("2020-01-27T20:06:11.847"), Literal("yyyy-MM-dd HH:mm:ss.SSS")), + UnixTimestamp(Literal("Unparseable"), Literal("yyyy-MM-dd HH:mm:ss.SSS")), + ToUnixTimestamp(Literal("2020-01-27T20:06:11.847"), Literal("yyyy-MM-dd HH:mm:ss.SSS")), + ToUnixTimestamp(Literal("Unparseable"), Literal("yyyy-MM-dd HH:mm:ss.SSS")) + ) + + if (!ansiEnabled) { + exprSeq.foreach(checkEvaluation(_, null)) + } else if (policy == "LEGACY") { + exprSeq.foreach(checkExceptionInExpression[ParseException](_, "Unparseable")) + } else { + exprSeq.foreach( + checkExceptionInExpression[DateTimeParseException](_, "could not be parsed")) + } + + // LEGACY works, CORRECTED failed, EXCEPTION with SparkUpgradeException + val exprSeq2 = Seq[(Expression, Long)]( + (GetTimestamp(Literal("2020-01-27T20:06:11.847!!!"), + Literal("yyyy-MM-dd'T'HH:mm:ss.SSS")), 1580184371847000L), + (UnixTimestamp(Literal("2020-01-27T20:06:11.847!!!"), + Literal("yyyy-MM-dd'T'HH:mm:ss.SSS")), 1580184371L), + (ToUnixTimestamp(Literal("2020-01-27T20:06:11.847!!!"), + Literal("yyyy-MM-dd'T'HH:mm:ss.SSS")), 1580184371L) + ) + + if (policy == "LEGACY") { + exprSeq2.foreach(pair => checkEvaluation(pair._1, pair._2)) + } else if (policy == "EXCEPTION") { + exprSeq2.foreach(pair => + checkExceptionInExpression[SparkUpgradeException]( + pair._1, + "You may get a different result due to the upgrading of Spark 3.0")) + } else { + if (ansiEnabled) { + exprSeq2.foreach(pair => + checkExceptionInExpression[DateTimeParseException](pair._1, "could not be parsed")) + } else { + exprSeq2.foreach(pair => checkEvaluation(pair._1, null)) + } + } + } + } + } + } + } diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index 19b4c53702662..534e222b7c13e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -153,3 +153,14 @@ select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat' select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')); select from_csv('26/October/2015', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy')); select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')); + +-- Timestamp type parse error +select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); 
+select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); +select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); +select to_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); +select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); +select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); +select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS"); +select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS"); +select cast("Unparseable" as timestamp) diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out index 5b357fd064e41..10669f14aa87b 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 108 +-- Number of queries: 117 -- !query @@ -301,9 +301,10 @@ struct -- !query select '1' - interval '2' second -- !query schema -struct +struct<> -- !query output -NULL +java.time.DateTimeException +Cannot cast 1 to TimestampType. -- !query @@ -600,9 +601,10 @@ struct -- !query select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]') -- !query schema -struct +struct<> -- !query output -NULL +java.time.format.DateTimeParseException +Text '2019-10-06 10:11:12.' could not be parsed at index 20 -- !query @@ -664,9 +666,10 @@ struct +struct<> -- !query output -NULL +java.time.format.DateTimeParseException +Text '2019-10-06 10:11:12.1234567PST' could not be parsed, unparsed text found at index 26 -- !query @@ -680,9 +683,10 @@ struct +struct<> -- !query output -NULL +java.time.format.DateTimeParseException +Text '223456 2019-10-06 10:11:12.123456PST' could not be parsed at index 27 -- !query @@ -744,17 +748,19 @@ struct +struct<> -- !query output -NULL +java.time.format.DateTimeParseException +Text '12.1232019-10-06S10:11' could not be parsed at index 7 -- !query select to_timestamp("12.1232019-10-06S10:11", "ss.SSSSyy-MM-dd'S'HH:mm") -- !query schema -struct +struct<> -- !query output -NULL +java.time.format.DateTimeParseException +Text '12.1232019-10-06S10:11' could not be parsed at index 9 -- !query @@ -824,9 +830,10 @@ struct -- !query select to_timestamp("02-29", "MM-dd") -- !query schema -struct +struct<> -- !query output -NULL +java.time.DateTimeException +Invalid date 'February 29' as '1970' is not a leap year -- !query @@ -840,9 +847,10 @@ struct -- !query select to_date("02-29", "MM-dd") -- !query schema -struct +struct<> -- !query output -NULL +java.time.DateTimeException +Invalid date 'February 29' as '1970' is not a leap year -- !query @@ -931,3 +939,84 @@ struct<> -- !query output org.apache.spark.SparkUpgradeException You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text '2020-01-27T20:06:11.847' could not be parsed at index 10 + + +-- !query +select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text 'Unparseable' could not be parsed at index 0 + + +-- !query +select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text '2020-01-27T20:06:11.847' could not be parsed at index 10 + + +-- !query +select to_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text 'Unparseable' could not be parsed at index 0 + + +-- !query +select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text '2020-01-27T20:06:11.847' could not be parsed at index 10 + + +-- !query +select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text 'Unparseable' could not be parsed at index 0 + + +-- !query +select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text '2020-01-27T20:06:11.847' could not be parsed at index 10 + + +-- !query +select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct<> +-- !query output +java.time.format.DateTimeParseException +Text 'Unparseable' could not be parsed at index 0 + + +-- !query +select cast("Unparseable" as timestamp) +-- !query schema +struct<> +-- !query output +java.time.DateTimeException +Cannot cast Unparseable to TimestampType. 
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out index 8727b74d771ee..7c2c62a2db496 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 108 +-- Number of queries: 117 -- !query @@ -901,3 +901,75 @@ select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy')) struct> -- !query output {"d":2015-10-26} + + +-- !query +select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast("Unparseable" as timestamp) +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 850cc86d943d3..810ab6ef0cbfc 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 108 +-- Number of queries: 117 -- !query @@ -909,3 +909,75 @@ struct<> -- !query output org.apache.spark.SparkUpgradeException You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'dd/MMMMM/yyyy' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 
2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html + + +-- !query +select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS") +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast("Unparseable" as timestamp) +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out index 553432e503d5c..0e177f7ea82bd 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out @@ -71,7 +71,8 @@ insert into datetimes values -- !query schema struct<> -- !query output - +org.apache.spark.sql.AnalysisException +failed to evaluate expression CAST('11:00 BST' AS TIMESTAMP): Cannot cast 11:00 BST to TimestampType.; line 1 pos 22 -- !query From 35ded12fc67a3d8e51f8be3186246745a72a05bc Mon Sep 17 00:00:00 2001 From: luluorta Date: Fri, 27 Nov 2020 13:32:25 +0000 Subject: [PATCH 028/150] [SPARK-33141][SQL] Capture SQL configs when creating permanent views ### What changes were proposed in this pull request? This PR makes CreateViewCommand/AlterViewAsCommand capturing runtime SQL configs and store them as view properties. These configs will be applied during the parsing and analysis phases of the view resolution. Users can set `spark.sql.legacy.useCurrentConfigsForView` to `true` to restore the behavior before. ### Why are the changes needed? This PR is a sub-task of [SPARK-33138](https://issues.apache.org/jira/browse/SPARK-33138) that proposes to unify temp view and permanent view behaviors. This PR makes permanent views mimicking the temp view behavior that "fixes" view semantic by directly storing resolved LogicalPlan. For example, if a user uses spark 2.4 to create a view that contains null values from division-by-zero expressions, she may not want that other users' queries which reference her view throw exceptions when running on spark 3.x with ansi mode on. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? added UT + existing UTs (improved) Closes #30289 from luluorta/SPARK-33141. 
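The user-facing effect described above can be sketched roughly as follows (the session handle, the view name, and the exact error raised under ANSI mode are illustrative assumptions):

```scala
// Create the view with ANSI mode off; that config is captured as a view property.
spark.conf.set("spark.sql.ansi.enabled", false)
spark.sql("CREATE VIEW v AS SELECT 1 / 0 AS c")

// A later query with ANSI mode on still resolves the view with the captured configs,
// so the division keeps its old null-on-zero semantics.
spark.conf.set("spark.sql.ansi.enabled", true)
spark.table("v").show()

// The legacy flag restores the old behavior of using the current session configs,
// so the view is now analyzed with ANSI semantics and the division by zero fails.
spark.conf.set("spark.sql.legacy.useCurrentConfigsForView", true)
spark.table("v").show()
```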
Authored-by: luluorta Signed-off-by: Wenchen Fan --- docs/sql-migration-guide.md | 2 + .../sql/catalyst/analysis/Analyzer.scala | 4 +- .../sql/catalyst/catalog/SessionCatalog.scala | 9 ++- .../sql/catalyst/catalog/interface.scala | 18 +++++ .../plans/logical/basicLogicalOperators.scala | 16 ++++ .../apache/spark/sql/internal/SQLConf.scala | 11 +++ .../spark/sql/execution/command/views.scala | 49 +++++++++++- .../results/postgreSQL/create_view.sql.out | 28 +++---- .../spark/sql/execution/SQLViewSuite.scala | 75 ++++++++++++++++++- 9 files changed, 190 insertions(+), 22 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 6942ef7201703..7997090e710a9 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -51,6 +51,8 @@ license: | - In Spark 3.1, the `schema_of_json` and `schema_of_csv` functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case. - In Spark 3.1, refreshing a table will trigger an uncache operation for all other caches that reference the table, even if the table itself is not cached. In Spark 3.0 the operation will only be triggered if the table itself is cached. + + - In Spark 3.1, creating or altering a view will capture runtime SQL configs and store them as view properties. These configs will be applied during the parsing and analysis phases of the view resolution. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.useCurrentConfigsForView` to `true`. ## Upgrading from Spark SQL 3.0 to 3.0.1 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 77c1dd9ebb7fa..dae496244c858 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1034,7 +1034,9 @@ class Analyzer(override val catalogManager: CatalogManager) s"avoid errors. Increase the value of ${SQLConf.MAX_NESTED_VIEW_DEPTH.key} to work " + "around this.") } - executeSameContext(child) + SQLConf.withExistingConf(View.effectiveSQLConf(desc.viewSQLConfigs)) { + executeSameContext(child) + } } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 17ab6664df75c..5122ca7521d9a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -795,14 +795,19 @@ class SessionCatalog( if (metadata.tableType == CatalogTableType.VIEW) { val viewText = metadata.viewText.getOrElse(sys.error("Invalid view without text.")) - logDebug(s"'$viewText' will be used for the view($table).") + val viewConfigs = metadata.viewSQLConfigs + val viewPlan = SQLConf.withExistingConf(View.effectiveSQLConf(viewConfigs)) { + parser.parsePlan(viewText) + } + + logDebug(s"'$viewText' will be used for the view($table) with configs: $viewConfigs.") // The relation is a view, so we wrap the relation by: // 1. Add a [[View]] operator over the relation to keep track of the view desc; // 2. Wrap the logical plan in a [[SubqueryAlias]] which tracks the name of the view. 
val child = View( desc = metadata, output = metadata.schema.toAttributes, - child = parser.parsePlan(viewText)) + child = viewPlan) SubqueryAlias(multiParts, child) } else { SubqueryAlias(multiParts, UnresolvedCatalogRelation(metadata, options)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index ee7216e93ebb5..621ad84f1f5ec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -305,6 +305,22 @@ case class CatalogTable( } } + /** + * Return the SQL configs of when the view was created, the configs are applied when parsing and + * analyzing the view, should be empty if the CatalogTable is not a View or created by older + * versions of Spark(before 3.1.0). + */ + def viewSQLConfigs: Map[String, String] = { + try { + for ((key, value) <- properties if key.startsWith(CatalogTable.VIEW_SQL_CONFIG_PREFIX)) + yield (key.substring(CatalogTable.VIEW_SQL_CONFIG_PREFIX.length), value) + } catch { + case e: Exception => + throw new AnalysisException( + "Corrupted view SQL configs in catalog", cause = Some(e)) + } + } + /** * Return the output column names of the query that creates a view, the column names are used to * resolve a view, should be empty if the CatalogTable is not a View or created by older versions @@ -411,6 +427,8 @@ object CatalogTable { props.toMap } + val VIEW_SQL_CONFIG_PREFIX = VIEW_PREFIX + "sqlConfig." + val VIEW_QUERY_OUTPUT_PREFIX = VIEW_PREFIX + "query.out." val VIEW_QUERY_OUTPUT_NUM_COLUMNS = VIEW_QUERY_OUTPUT_PREFIX + "numCols" val VIEW_QUERY_OUTPUT_COLUMN_NAME_PREFIX = VIEW_QUERY_OUTPUT_PREFIX + "col." diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c7108ea8ac74b..a524ed4ff73e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -453,6 +453,22 @@ case class View( } } +object View { + def effectiveSQLConf(configs: Map[String, String]): SQLConf = { + val activeConf = SQLConf.get + if (activeConf.useCurrentSQLConfigsForView) return activeConf + + val sqlConf = new SQLConf() + for ((k, v) <- configs) { + sqlConf.settings.put(k, v) + } + // We should respect the current maxNestedViewDepth cause the view resolving are executed + // from top to down. + sqlConf.setConf(SQLConf.MAX_NESTED_VIEW_DEPTH, activeConf.maxNestedViewDepth) + sqlConf + } +} + /** * A container for holding named common table expressions (CTEs) and a query plan. * This operator will be removed during analysis and the relations will be substituted into child. 
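A minimal, self-contained sketch (not part of this patch) of the property round trip shown around here: configs captured at view creation are flattened under the `view.sqlConfig.` prefix (done by `ViewHelper.sqlConfigsToProps` later in this patch), and `CatalogTable.viewSQLConfigs` strips the prefix back off before `View.effectiveSQLConf` turns the map into a `SQLConf`. Plain `Map`s stand in for `CatalogTable` and `SQLConf` here; only the prefix convention is taken from the patch.

```scala
object ViewConfigSketch {
  // Matches CatalogTable.VIEW_SQL_CONFIG_PREFIX defined in this patch.
  val ViewSqlConfigPrefix = "view.sqlConfig."

  // Analogous to ViewHelper.sqlConfigsToProps: prefix each captured config key.
  def toProps(capturedConfs: Map[String, String]): Map[String, String] =
    capturedConfs.map { case (k, v) => (ViewSqlConfigPrefix + k, v) }

  // Analogous to CatalogTable.viewSQLConfigs: recover the configs by stripping the prefix.
  def fromProps(properties: Map[String, String]): Map[String, String] =
    properties.collect {
      case (k, v) if k.startsWith(ViewSqlConfigPrefix) =>
        (k.substring(ViewSqlConfigPrefix.length), v)
    }

  def main(args: Array[String]): Unit = {
    val captured = Map("spark.sql.ansi.enabled" -> "true")
    val props = toProps(captured)
    // The round trip is lossless: what was captured is what view resolution sees.
    assert(fromProps(props) == captured)
    println(props)   // Map(view.sqlConfig.spark.sql.ansi.enabled -> true)
  }
}
```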
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index add9a1d0f3aa6..b2c28ffa984a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1481,6 +1481,15 @@ object SQLConf { "must be positive.") .createWithDefault(100) + val USE_CURRENT_SQL_CONFIGS_FOR_VIEW = + buildConf("spark.sql.legacy.useCurrentConfigsForView") + .internal() + .doc("When true, SQL Configs of the current active SparkSession instead of the captured " + + "ones will be applied during the parsing and analysis phases of the view resolution.") + .version("3.1.0") + .booleanConf + .createWithDefault(false) + val STREAMING_FILE_COMMIT_PROTOCOL_CLASS = buildConf("spark.sql.streaming.commitProtocolClass") .version("2.1.0") @@ -3415,6 +3424,8 @@ class SQLConf extends Serializable with Logging { def maxNestedViewDepth: Int = getConf(SQLConf.MAX_NESTED_VIEW_DEPTH) + def useCurrentSQLConfigsForView: Boolean = getConf(SQLConf.USE_CURRENT_SQL_CONFIGS_FOR_VIEW) + def starSchemaDetection: Boolean = getConf(STARSCHEMA_DETECTION) def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 43bc50522f2a8..a02f863a360f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeRef import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View} import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper -import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.{BooleanType, MetadataBuilder, StringType} import org.apache.spark.sql.util.SchemaUtils @@ -334,6 +334,18 @@ case class ShowViewsCommand( object ViewHelper { + private val configPrefixDenyList = Seq( + SQLConf.MAX_NESTED_VIEW_DEPTH.key, + "spark.sql.optimizer.", + "spark.sql.codegen.", + "spark.sql.execution.", + "spark.sql.shuffle.", + "spark.sql.adaptive.") + + private def shouldCaptureConfig(key: String): Boolean = { + !configPrefixDenyList.exists(prefix => key.startsWith(prefix)) + } + import CatalogTable._ /** @@ -361,11 +373,37 @@ object ViewHelper { } } + /** + * Convert the view SQL configs to `properties`. + */ + private def sqlConfigsToProps(conf: SQLConf): Map[String, String] = { + val modifiedConfs = conf.getAllConfs.filter { case (k, _) => + conf.isModifiable(k) && shouldCaptureConfig(k) + } + val props = new mutable.HashMap[String, String] + for ((key, value) <- modifiedConfs) { + props.put(s"$VIEW_SQL_CONFIG_PREFIX$key", value) + } + props.toMap + } + + /** + * Remove the view SQL configs in `properties`. + */ + private def removeSQLConfigs(properties: Map[String, String]): Map[String, String] = { + // We can't use `filterKeys` here, as the map returned by `filterKeys` is not serializable, + // while `CatalogTable` should be serializable. + properties.filterNot { case (key, _) => + key.startsWith(VIEW_SQL_CONFIG_PREFIX) + } + } + /** * Generate the view properties in CatalogTable, including: * 1. 
view default database that is used to provide the default database name on view resolution. * 2. the output column names of the query that creates a view, this is used to map the output of * the view child to the view output during view resolution. + * 3. the SQL configs when creating the view. * * @param properties the `properties` in CatalogTable. * @param session the spark session. @@ -380,15 +418,18 @@ object ViewHelper { // for createViewCommand queryOutput may be different from fieldNames val queryOutput = analyzedPlan.schema.fieldNames + val conf = session.sessionState.conf + // Generate the query column names, throw an AnalysisException if there exists duplicate column // names. SchemaUtils.checkColumnNameDuplication( - fieldNames, "in the view definition", session.sessionState.conf.resolver) + fieldNames, "in the view definition", conf.resolver) - // Generate the view default catalog and namespace. + // Generate the view default catalog and namespace, as well as captured SQL configs. val manager = session.sessionState.catalogManager - removeQueryColumnNames(properties) ++ + removeSQLConfigs(removeQueryColumnNames(properties)) ++ catalogAndNamespaceToProps(manager.currentCatalog.name, manager.currentNamespace) ++ + sqlConfigsToProps(conf) ++ generateQueryColumnNames(queryOutput) } diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out index ae1cb2f171704..2fab32fa4b4eb 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -257,7 +257,7 @@ View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -313,7 +313,7 @@ View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -359,7 +359,7 @@ View Original Text SELECT t1.a AS t1_a, t2.a AS t2_a WHERE t1.id = t2.id View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [t1_a, t2_a] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=t1_a, view.query.out.numCols=2, view.query.out.col.1=t2_a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, 
view.query.out.col.0=t1_a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=t2_a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -413,7 +413,7 @@ View Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_t View Original Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -443,7 +443,7 @@ View Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_ View Original Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [id, a] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=id, view.query.out.numCols=2, view.query.out.col.1=a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=id, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -473,7 +473,7 @@ View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_t View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -503,7 +503,7 @@ View Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM ba View Original Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -533,7 +533,7 @@ View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, 
id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -669,7 +669,7 @@ View Text SELECT * FROM t1 CROSS JOIN t2 View Original Text SELECT * FROM t1 CROSS JOIN t2 View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -710,7 +710,7 @@ View Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -751,7 +751,7 @@ View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -792,7 +792,7 @@ View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.va View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, 
view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -894,7 +894,7 @@ BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [a, b] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -933,7 +933,7 @@ AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [a, b] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index d776198bc7470..0b19f706836be 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.internal.SQLConf.MAX_NESTED_VIEW_DEPTH +import org.apache.spark.sql.internal.SQLConf._ import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} class SimpleSQLViewSuite extends SQLViewSuite with SharedSparkSession @@ -762,4 +762,77 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } } } + + test("SPARK-33141: view should be parsed and analyzed with configs set when creating") { + withTable("t") { + withView("v1", "v2", "v3", "v4", "v5") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + sql("CREATE VIEW v1 (c1) AS SELECT C1 FROM t") + sql("CREATE VIEW v2 (c1) AS SELECT c1 FROM t ORDER BY 1 ASC, c1 DESC") + sql("CREATE VIEW v3 (c1, count) AS SELECT c1, count(c1) FROM t GROUP BY 1") + sql("CREATE VIEW v4 (a, count) AS SELECT c1 as a, count(c1) FROM t GROUP BY a") + sql("CREATE VIEW v5 (c1) AS SELECT 1/0") + + withSQLConf(CASE_SENSITIVE.key -> "true") { + checkAnswer(sql("SELECT * FROM v1"), Seq(Row(2), 
Row(3), Row(1))) + } + withSQLConf(ORDER_BY_ORDINAL.key -> "false") { + checkAnswer(sql("SELECT * FROM v2"), Seq(Row(1), Row(2), Row(3))) + } + withSQLConf(GROUP_BY_ORDINAL.key -> "false") { + checkAnswer(sql("SELECT * FROM v3"), + Seq(Row(1, 1), Row(2, 1), Row(3, 1))) + } + withSQLConf(GROUP_BY_ALIASES.key -> "false") { + checkAnswer(sql("SELECT * FROM v4"), + Seq(Row(1, 1), Row(2, 1), Row(3, 1))) + } + withSQLConf(ANSI_ENABLED.key -> "true") { + checkAnswer(sql("SELECT * FROM v5"), Seq(Row(null))) + } + + withSQLConf(USE_CURRENT_SQL_CONFIGS_FOR_VIEW.key -> "true") { + withSQLConf(CASE_SENSITIVE.key -> "true") { + val e = intercept[AnalysisException] { + sql("SELECT * FROM v1") + }.getMessage + assert(e.contains("cannot resolve '`C1`' given input columns: " + + "[spark_catalog.default.t.c1]")) + } + withSQLConf(ORDER_BY_ORDINAL.key -> "false") { + checkAnswer(sql("SELECT * FROM v2"), Seq(Row(3), Row(2), Row(1))) + } + withSQLConf(GROUP_BY_ORDINAL.key -> "false") { + val e = intercept[AnalysisException] { + sql("SELECT * FROM v3") + }.getMessage + assert(e.contains("expression 'spark_catalog.default.t.`c1`' is neither present " + + "in the group by, nor is it an aggregate function. Add to group by or wrap in " + + "first() (or first_value) if you don't care which value you get.")) + } + withSQLConf(GROUP_BY_ALIASES.key -> "false") { + val e = intercept[AnalysisException] { + sql("SELECT * FROM v4") + }.getMessage + assert(e.contains("cannot resolve '`a`' given input columns: " + + "[spark_catalog.default.t.c1]")) + } + withSQLConf(ANSI_ENABLED.key -> "true") { + val e = intercept[ArithmeticException] { + sql("SELECT * FROM v5").collect() + }.getMessage + assert(e.contains("divide by zero")) + } + } + + withSQLConf(ANSI_ENABLED.key -> "true") { + sql("ALTER VIEW v1 AS SELECT 1/0") + } + val e = intercept[ArithmeticException] { + sql("SELECT * FROM v1").collect() + }.getMessage + assert(e.contains("divide by zero")) + } + } + } } From 13fd272cd353c8aa40a6030c4c847c2e2f632f68 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Fri, 27 Nov 2020 10:22:45 -0600 Subject: [PATCH 029/150] Spelling r common dev mlib external project streaming resource managers python ### What changes were proposed in this pull request? This PR intends to fix typos in the sub-modules: * `R` * `common` * `dev` * `mlib` * `external` * `project` * `streaming` * `resource-managers` * `python` Split per srowen https://github.com/apache/spark/pull/30323#issuecomment-728981618 NOTE: The misspellings have been reported at https://github.com/jsoref/spark/commit/706a726f87a0bbf5e31467fae9015218773db85b#commitcomment-44064356 ### Why are the changes needed? Misspelled words make it harder to read / understand content. ### Does this PR introduce _any_ user-facing change? There are various fixes to documentation, etc... ### How was this patch tested? No testing was performed Closes #30402 from jsoref/spelling-R_common_dev_mlib_external_project_streaming_resource-managers_python. 
Authored-by: Josh Soref Signed-off-by: Sean Owen --- R/CRAN_RELEASE.md | 2 +- R/install-dev.bat | 2 +- R/pkg/R/DataFrame.R | 6 ++--- R/pkg/R/RDD.R | 4 ++-- R/pkg/R/SQLContext.R | 2 +- R/pkg/R/WindowSpec.R | 4 ++-- R/pkg/R/column.R | 16 +++++++------- R/pkg/R/context.R | 4 ++-- R/pkg/R/deserialize.R | 2 +- R/pkg/R/functions.R | 4 ++-- R/pkg/R/install.R | 2 +- R/pkg/R/mllib_fpm.R | 2 +- R/pkg/R/mllib_tree.R | 4 ++-- R/pkg/R/mllib_utils.R | 2 +- R/pkg/R/pairRDD.R | 4 ++-- R/pkg/R/streaming.R | 2 +- R/pkg/R/types.R | 2 +- R/pkg/R/utils.R | 2 +- R/pkg/inst/worker/daemon.R | 4 ++-- R/pkg/inst/worker/worker.R | 8 +++---- R/pkg/tests/fulltests/test_Serde.R | 2 +- R/pkg/tests/fulltests/test_jvm_api.R | 6 ++--- R/pkg/tests/fulltests/test_sparkSQL.R | 6 ++--- R/pkg/tests/fulltests/test_utils.R | 2 +- R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +- .../spark/util/kvstore/LevelDBTypeInfo.java | 2 +- .../spark/network/client/TransportClient.java | 2 +- .../spark/network/crypto/AuthEngine.java | 2 +- .../spark/network/crypto/AuthEngineSuite.java | 10 ++++----- .../protocol/MessageWithHeaderSuite.java | 4 ++-- .../spark/network/sasl/SparkSaslSuite.java | 16 +++++++------- .../server/OneForOneStreamManagerSuite.java | 2 +- .../util/TransportFrameDecoderSuite.java | 2 +- .../network/shuffle/SimpleDownloadFile.java | 2 +- .../apache/spark/unsafe/types/UTF8String.java | 10 ++++----- .../types/UTF8StringPropertyCheckSuite.scala | 6 ++--- dev/appveyor-guide.md | 12 +++++----- dev/create-release/known_translations | 2 +- dev/create-release/release-build.sh | 2 +- dev/create-release/releaseutils.py | 6 ++--- dev/create-release/translate-contributors.py | 22 +++++++++---------- dev/github_jira_sync.py | 10 ++++----- dev/run-tests-jenkins.py | 18 +++++++-------- dev/run-tests.py | 6 ++--- dev/tests/pr_merge_ability.sh | 2 +- dev/tests/pr_public_classes.sh | 2 +- project/MimaExcludes.scala | 2 +- project/SparkBuild.scala | 6 ++--- python/docs/source/_static/css/pyspark.css | 2 +- .../source/_templates/autosummary/class.rst | 2 +- python/docs/source/development/debugging.rst | 2 +- python/docs/source/development/testing.rst | 2 +- .../docs/source/getting_started/install.rst | 6 ++--- .../source/getting_started/quickstart.ipynb | 4 ++-- python/docs/source/index.rst | 2 +- python/pyspark/__init__.pyi | 2 +- python/pyspark/cloudpickle/cloudpickle.py | 10 ++++----- .../pyspark/cloudpickle/cloudpickle_fast.py | 10 ++++----- python/pyspark/context.py | 4 ++-- python/pyspark/java_gateway.py | 2 +- python/pyspark/ml/feature.py | 2 +- python/pyspark/ml/regression.py | 2 +- python/pyspark/ml/regression.pyi | 2 +- python/pyspark/ml/tests/test_algorithms.py | 2 +- python/pyspark/ml/tests/test_image.py | 2 +- python/pyspark/mllib/clustering.py | 2 +- python/pyspark/mllib/evaluation.py | 4 ++-- python/pyspark/mllib/regression.py | 2 +- python/pyspark/mllib/stat/_statistics.py | 2 +- .../mllib/tests/test_streaming_algorithms.py | 2 +- python/pyspark/rdd.py | 4 ++-- python/pyspark/resource/requests.py | 4 ++-- python/pyspark/shuffle.py | 2 +- python/pyspark/sql/column.py | 2 +- python/pyspark/sql/dataframe.py | 2 +- python/pyspark/sql/functions.py | 14 ++++++------ .../sql/pandas/_typing/protocols/frame.pyi | 2 +- .../sql/pandas/_typing/protocols/series.pyi | 2 +- python/pyspark/sql/pandas/functions.py | 4 ++-- .../sql/tests/test_pandas_grouped_map.py | 2 +- python/pyspark/sql/tests/test_udf.py | 4 ++-- python/pyspark/sql/utils.py | 6 ++--- python/pyspark/streaming/context.py | 2 +- python/pyspark/tests/test_context.py | 4 ++-- 
python/pyspark/worker.py | 2 +- python/test_support/userlibrary.py | 2 +- .../org/apache/spark/deploy/k8s/Config.scala | 2 +- .../k8s/ExecutorPodsSnapshotsStoreImpl.scala | 4 ++-- .../k8s/KubernetesVolumeUtilsSuite.scala | 4 ++-- .../MountVolumesFeatureStepSuite.scala | 2 +- .../apache/spark/deploy/mesos/config.scala | 2 +- .../cluster/mesos/MesosSchedulerUtils.scala | 2 +- .../spark/deploy/yarn/YarnAllocator.scala | 2 +- .../apache/hadoop/net/ServerSocketUtil.java | 2 +- .../yarn/YarnShuffleServiceSuite.scala | 2 +- .../streaming/api/python/PythonDStream.scala | 2 +- .../spark/streaming/dstream/DStream.scala | 4 ++-- .../spark/streaming/util/HdfsUtils.scala | 2 +- .../apache/spark/streaming/JavaAPISuite.java | 2 +- .../spark/streaming/MapWithStateSuite.scala | 6 ++--- .../streaming/rdd/MapWithStateRDDSuite.scala | 6 ++--- 101 files changed, 208 insertions(+), 208 deletions(-) diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md index 4d9b6416c01cb..2f410cf8bfd94 100644 --- a/R/CRAN_RELEASE.md +++ b/R/CRAN_RELEASE.md @@ -25,7 +25,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. -Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`). +Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, e.g. `yum -q -y install qpdf`). To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible. diff --git a/R/install-dev.bat b/R/install-dev.bat index c570d93049a14..ae5aa589a19d1 100644 --- a/R/install-dev.bat +++ b/R/install-dev.bat @@ -26,7 +26,7 @@ MKDIR %SPARK_HOME%\R\lib rem When you pass the package path directly as an argument to R CMD INSTALL, rem it takes the path as 'C:\projects\spark\R\..\R\pkg"' as an example at -rem R 4.0. To work around this, directly go to the directoy and install it. +rem R 4.0. To work around this, directly go to the directory and install it. rem See also SPARK-32074 pushd %SPARK_HOME%\R\pkg\ R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" . diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 2ce53782d9af0..31a651ea1279b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2772,7 +2772,7 @@ setMethod("merge", #' Creates a list of columns by replacing the intersected ones with aliases #' #' Creates a list of columns by replacing the intersected ones with aliases. -#' The name of the alias column is formed by concatanating the original column name and a suffix. 
+#' The name of the alias column is formed by concatenating the original column name and a suffix. #' #' @param x a SparkDataFrame #' @param intersectedColNames a list of intersected column names of the SparkDataFrame @@ -3231,7 +3231,7 @@ setMethod("describe", #' \item stddev #' \item min #' \item max -#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%") +#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75\%") #' } #' If no statistics are given, this function computes count, mean, stddev, min, #' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max. @@ -3743,7 +3743,7 @@ setMethod("histogram", #' #' @param x a SparkDataFrame. #' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}. -#' @param tableName yhe name of the table in the external database. +#' @param tableName the name of the table in the external database. #' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore' #' save mode (it is 'error' by default) #' @param ... additional JDBC database connection properties. diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7a1d157bb8a36..408a3ff25b2b2 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -970,7 +970,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", MAXINT))))) # If the first sample didn't turn out large enough, keep trying to # take samples; this shouldn't happen often because we use a big - # multiplier for thei initial size + # multiplier for the initial size while (length(samples) < total) samples <- collectRDD(sampleRDD(x, withReplacement, fraction, as.integer(ceiling(stats::runif(1, @@ -1512,7 +1512,7 @@ setMethod("glom", #' #' @param x An RDD. #' @param y An RDD. -#' @return a new RDD created by performing the simple union (witout removing +#' @return a new RDD created by performing the simple union (without removing #' duplicates) of two input RDDs. #' @examples #'\dontrun{ diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index c0ac68332ec41..5ed0481f33d8f 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -203,7 +203,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { }) } - # SPAKR-SQL does not support '.' in column name, so replace it with '_' + # SPARK-SQL does not support '.' in column name, so replace it with '_' # TODO(davies): remove this once SPARK-2775 is fixed names <- lapply(names, function(n) { nn <- gsub(".", "_", n, fixed = TRUE) diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 037809cd0923e..be47d0117ed7f 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -54,7 +54,7 @@ setMethod("show", "WindowSpec", #' Defines the partitioning columns in a WindowSpec. #' #' @param x a WindowSpec. -#' @param col a column to partition on (desribed by the name or Column). +#' @param col a column to partition on (described by the name or Column). #' @param ... additional column(s) to partition on. #' @return A WindowSpec. 
#' @rdname partitionBy @@ -231,7 +231,7 @@ setMethod("rangeBetween", #' @rdname over #' @name over #' @aliases over,Column,WindowSpec-method -#' @family colum_func +#' @family column_func #' @examples #' \dontrun{ #' df <- createDataFrame(mtcars) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 835178990b485..9fa117ccb6281 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -135,7 +135,7 @@ createMethods() #' @rdname alias #' @name alias #' @aliases alias,Column-method -#' @family colum_func +#' @family column_func #' @examples #' \dontrun{ #' df <- createDataFrame(iris) @@ -161,7 +161,7 @@ setMethod("alias", #' #' @rdname substr #' @name substr -#' @family colum_func +#' @family column_func #' @aliases substr,Column-method #' #' @param x a Column. @@ -187,7 +187,7 @@ setMethod("substr", signature(x = "Column"), #' #' @rdname startsWith #' @name startsWith -#' @family colum_func +#' @family column_func #' @aliases startsWith,Column-method #' #' @param x vector of character string whose "starts" are considered @@ -206,7 +206,7 @@ setMethod("startsWith", signature(x = "Column"), #' #' @rdname endsWith #' @name endsWith -#' @family colum_func +#' @family column_func #' @aliases endsWith,Column-method #' #' @param x vector of character string whose "ends" are considered @@ -224,7 +224,7 @@ setMethod("endsWith", signature(x = "Column"), #' #' @rdname between #' @name between -#' @family colum_func +#' @family column_func #' @aliases between,Column-method #' #' @param x a Column @@ -251,7 +251,7 @@ setMethod("between", signature(x = "Column"), # nolint end #' @rdname cast #' @name cast -#' @family colum_func +#' @family column_func #' @aliases cast,Column-method #' #' @examples @@ -300,7 +300,7 @@ setMethod("%in%", #' Can be a single value or a Column. #' @rdname otherwise #' @name otherwise -#' @family colum_func +#' @family column_func #' @aliases otherwise,Column-method #' @note otherwise since 1.5.0 setMethod("otherwise", @@ -440,7 +440,7 @@ setMethod("withField", #' ) #' #' # However, if you are going to add/replace multiple nested fields, -#' # it is preffered to extract out the nested struct before +#' # it is preferred to extract out the nested struct before #' # adding/replacing multiple fields e.g. #' head( #' withColumn( diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index e3c9d9f8793d6..cca6c2c817de9 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -86,7 +86,7 @@ makeSplits <- function(numSerializedSlices, length) { # For instance, for numSerializedSlices of 22, length of 50 # [1] 0 0 2 2 4 4 6 6 6 9 9 11 11 13 13 15 15 15 18 18 20 20 22 22 22 # [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47 - # Notice the slice group with 3 slices (ie. 6, 15, 22) are roughly evenly spaced. + # Notice the slice group with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced. # We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD if (numSerializedSlices > 0) { unlist(lapply(0: (numSerializedSlices - 1), function(x) { @@ -116,7 +116,7 @@ makeSplits <- function(numSerializedSlices, length) { #' This change affects both createDataFrame and spark.lapply. #' In the specific one case that it is used to convert R native object into SparkDataFrame, it has #' always been kept at the default of 1. In the case the object is large, we are explicitly setting -#' the parallism to numSlices (which is still 1). +#' the parallelism to numSlices (which is still 1). 
#' #' Specifically, we are changing to split positions to match the calculation in positions() of #' ParallelCollectionRDD in Spark. diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 5d22340fb62a0..89a8fbecd36b0 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -250,7 +250,7 @@ readDeserializeWithKeysInArrow <- function(inputCon) { keys <- readMultipleObjects(inputCon) - # Read keys to map with each groupped batch later. + # Read keys to map with each grouped batch later. list(keys = keys, data = data) } diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index b12f7b472ec83..99406443165d5 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -144,7 +144,7 @@ NULL #' @param y Column to compute on. #' @param pos In \itemize{ #' \item \code{locate}: a start position of search. -#' \item \code{overlay}: a start postiton for replacement. +#' \item \code{overlay}: a start position for replacement. #' } #' @param len In \itemize{ #' \item \code{lpad} the maximum length of each output result. @@ -2918,7 +2918,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"), }) #' @details -#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is +#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is #' a long value, it will return a long value else it will return an integer value. #' #' @rdname column_math_functions diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ea2c0b4c0f42f..5bc5ae07c5f03 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -289,7 +289,7 @@ sparkCachePath <- function() { } # Length of the Spark cache specific relative path segments for each platform -# eg. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix +# e.g. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix # Must match sparkCachePath() exactly. sparkCacheRelPathLength <- function() { if (is_windows()) { diff --git a/R/pkg/R/mllib_fpm.R b/R/pkg/R/mllib_fpm.R index 30bc51b932041..65a43514930f0 100644 --- a/R/pkg/R/mllib_fpm.R +++ b/R/pkg/R/mllib_fpm.R @@ -125,7 +125,7 @@ setMethod("spark.freqItemsets", signature(object = "FPGrowthModel"), #' The \code{SparkDataFrame} contains five columns: #' \code{antecedent} (an array of the same type as the input column), #' \code{consequent} (an array of the same type as the input column), -#' \code{condfidence} (confidence for the rule) +#' \code{confidence} (confidence for the rule) #' \code{lift} (lift for the rule) #' and \code{support} (support for the rule) #' @rdname spark.fpGrowth diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index f6aa48f5fa04a..b5a014b0a3cfd 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -53,7 +53,7 @@ setClass("DecisionTreeRegressionModel", representation(jobj = "jobj")) #' @note DecisionTreeClassificationModel since 2.3.0 setClass("DecisionTreeClassificationModel", representation(jobj = "jobj")) -# Create the summary of a tree ensemble model (eg. Random Forest, GBT) +# Create the summary of a tree ensemble model (e.g. Random Forest, GBT) summary.treeEnsemble <- function(model) { jobj <- model@jobj formula <- callJMethod(jobj, "formula") @@ -73,7 +73,7 @@ summary.treeEnsemble <- function(model) { jobj = jobj) } -# Prints the summary of tree ensemble models (eg. Random Forest, GBT) +# Prints the summary of tree ensemble models (e.g. 
Random Forest, GBT) print.summary.treeEnsemble <- function(x) { jobj <- x$jobj cat("Formula: ", x$formula) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R index f38f1ac3a6b4c..d943d8d0ab4c0 100644 --- a/R/pkg/R/mllib_utils.R +++ b/R/pkg/R/mllib_utils.R @@ -18,7 +18,7 @@ # mllib_utils.R: Utilities for MLlib integration # Integration with R's standard functions. -# Most of MLlib's argorithms are provided in two flavours: +# Most of MLlib's algorithms are provided in two flavours: # - a specialization of the default R methods (glm). These methods try to respect # the inputs and the outputs of R's method to the largest extent, but some small differences # may exist. diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index b29381bb900fb..41676be03e951 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -239,7 +239,7 @@ setMethod("partitionByRDD", javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner) # Call .values() on the result to get back the final result, the - # shuffled acutal content key-val pairs. + # shuffled actual content key-val pairs. r <- callJMethod(javaPairRDD, "values") RDD(r, serializedMode = "byte") @@ -411,7 +411,7 @@ setMethod("reduceByKeyLocally", #' \itemize{ #' \item createCombiner, which turns a V into a C (e.g., creates a one-element list) #' \item mergeValue, to merge a V into a C (e.g., adds it to the end of a list) - -#' \item mergeCombiners, to combine two C's into a single one (e.g., concatentates +#' \item mergeCombiners, to combine two C's into a single one (e.g., concatenates #' two lists). #' } #' diff --git a/R/pkg/R/streaming.R b/R/pkg/R/streaming.R index 5eccbdc9d3818..2bcfb363f9d24 100644 --- a/R/pkg/R/streaming.R +++ b/R/pkg/R/streaming.R @@ -93,7 +93,7 @@ setMethod("explain", #' lastProgress #' -#' Prints the most recent progess update of this streaming query in JSON format. +#' Prints the most recent progress update of this streaming query in JSON format. #' #' @param x a StreamingQuery. #' @rdname lastProgress diff --git a/R/pkg/R/types.R b/R/pkg/R/types.R index 5d48a9eee2799..dfa83c35665ce 100644 --- a/R/pkg/R/types.R +++ b/R/pkg/R/types.R @@ -68,7 +68,7 @@ rToSQLTypes <- as.environment(list( "character" = "string", "logical" = "boolean")) -# Helper function of coverting decimal type. When backend returns column type in the +# Helper function of converting decimal type. When backend returns column type in the # format of decimal(,) (e.g., decimal(10, 0)), this function coverts the column type # as double type. This function converts backend returned types that are not the key # of PRIMITIVE_TYPES, but should be treated as PRIMITIVE_TYPES. diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index d6f9f927d5cdc..264cbfc9ba929 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -930,7 +930,7 @@ getOne <- function(x, envir, inherits = TRUE, ifnotfound = NULL) { } # Returns a vector of parent directories, traversing up count times, starting with a full path -# eg. traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return +# e.g. 
traverseParentDirs("/Users/user/Library/Caches/spark/spark2.2", 1) should return # this "/Users/user/Library/Caches/spark/spark2.2" # and "/Users/user/Library/Caches/spark" traverseParentDirs <- function(x, count) { diff --git a/R/pkg/inst/worker/daemon.R b/R/pkg/inst/worker/daemon.R index fb9db63b07cd0..4589bb9c6ad1b 100644 --- a/R/pkg/inst/worker/daemon.R +++ b/R/pkg/inst/worker/daemon.R @@ -32,7 +32,7 @@ inputCon <- socketConnection( SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET")) -# Waits indefinitely for a socket connecion by default. +# Waits indefinitely for a socket connection by default. selectTimeout <- NULL while (TRUE) { @@ -72,7 +72,7 @@ while (TRUE) { } }) } else if (is.null(children)) { - # If it is NULL, there are no children. Waits indefinitely for a socket connecion. + # If it is NULL, there are no children. Waits indefinitely for a socket connection. selectTimeout <- NULL } diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 1ef05ea621e83..dd271f91d0084 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -85,7 +85,7 @@ outputResult <- function(serializer, output, outputCon) { } # Constants -specialLengths <- list(END_OF_STERAM = 0L, TIMING_DATA = -1L) +specialLengths <- list(END_OF_STREAM = 0L, TIMING_DATA = -1L) # Timing R process boot bootTime <- currentTimeSecs() @@ -180,7 +180,7 @@ if (isEmpty != 0) { } else if (deserializer == "arrow" && mode == 1) { data <- SparkR:::readDeserializeInArrow(inputCon) # See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html - # rbind.fill might be an anternative to make it faster if plyr is installed. + # rbind.fill might be an alternative to make it faster if plyr is installed. # Also, note that, 'dapply' applies a function to each partition. data <- do.call("rbind", data) } @@ -212,7 +212,7 @@ if (isEmpty != 0) { if (serializer == "arrow") { # See https://stat.ethz.ch/pipermail/r-help/2010-September/252046.html - # rbind.fill might be an anternative to make it faster if plyr is installed. + # rbind.fill might be an alternative to make it faster if plyr is installed. combined <- do.call("rbind", outputs) SparkR:::writeSerializeInArrow(outputCon, combined) } @@ -285,7 +285,7 @@ SparkR:::writeDouble(outputCon, computeInputElapsDiff) # compute SparkR:::writeDouble(outputCon, outputComputeElapsDiff) # output # End of output -SparkR:::writeInt(outputCon, specialLengths$END_OF_STERAM) +SparkR:::writeInt(outputCon, specialLengths$END_OF_STREAM) close(outputCon) close(inputCon) diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R index e01f6ee005218..a52289e43ca5e 100644 --- a/R/pkg/tests/fulltests/test_Serde.R +++ b/R/pkg/tests/fulltests/test_Serde.R @@ -125,7 +125,7 @@ test_that("SerDe of list of lists", { sparkR.session.stop() -# Note that this test should be at the end of tests since the configruations used here are not +# Note that this test should be at the end of tests since the configurations used here are not # specific to sessions, and the Spark context is restarted. 
test_that("createDataFrame large objects", { for (encryptionEnabled in list("true", "false")) { diff --git a/R/pkg/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R index 8b3b4f73de170..3bf6ae556c079 100644 --- a/R/pkg/tests/fulltests/test_jvm_api.R +++ b/R/pkg/tests/fulltests/test_jvm_api.R @@ -20,11 +20,11 @@ context("JVM API") sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) test_that("Create and call methods on object", { - jarr <- sparkR.newJObject("java.util.ArrayList") + jarray <- sparkR.newJObject("java.util.ArrayList") # Add an element to the array - sparkR.callJMethod(jarr, "add", 1L) + sparkR.callJMethod(jarray, "add", 1L) # Check if get returns the same element - expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L) + expect_equal(sparkR.callJMethod(jarray, "get", 0L), 1L) }) test_that("Call static methods", { diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 81d4e14df791d..833f77786c80b 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2093,7 +2093,7 @@ test_that("higher order functions", { createDataFrame(data.frame(id = 1)), expr("CAST(array(1.0, 2.0, -3.0, -4.0) AS array) xs"), expr("CAST(array(0.0, 3.0, 48.0) AS array) ys"), - expr("array('FAILED', 'SUCCEDED') as vs"), + expr("array('FAILED', 'SUCCEEDED') as vs"), expr("map('foo', 1, 'bar', 2) as mx"), expr("map('foo', 42, 'bar', -1, 'baz', 0) as my") ) @@ -3667,7 +3667,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { } # Computes the arithmetic mean of the second column by grouping - # on the first and third columns. Output the groupping value and the average. + # on the first and third columns. Output the grouping value and the average. schema <- structType(structField("a", "integer"), structField("c", "string"), structField("avg", "double")) df3 <- gapply( @@ -3965,7 +3965,7 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { paste("Error in listFunctions : analysis error - Database", "'zxwtyswklpf_db' does not exist")) - # recoverPartitions does not work with tempory view + # recoverPartitions does not work with temporary view expect_error(recoverPartitions("cars"), "no such table - Table or view 'cars' not found in database 'default'") expect_error(refreshTable("cars"), NA) diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R index c3fb9046fcda4..6c83a137cfb7b 100644 --- a/R/pkg/tests/fulltests/test_utils.R +++ b/R/pkg/tests/fulltests/test_utils.R @@ -116,7 +116,7 @@ test_that("cleanClosure on R functions", { actual <- get("y", envir = env, inherits = FALSE) expect_equal(actual, y) - # Test for combination for nested and sequenctial functions in a closure + # Test for combination for nested and sequential functions in a closure f1 <- function(x) x + 1 f2 <- function(x) f1(x) + 2 userFunc <- function(x) { f1(x); f2(x) } diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 3713e6c784855..a0608748696a3 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -146,7 +146,7 @@ sparkR.session.stop() Different from many other R packages, to use SparkR, you need an additional installation of Apache Spark. The Spark installation will be used to run a backend process that will compile and execute SparkR programs. 
-After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (eg. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). +After installing the SparkR package, you can call `sparkR.session` as explained in the previous section to start and it will check for the Spark installation. If you are working with SparkR from an interactive shell (e.g. R, RStudio) then Spark is downloaded and cached automatically if it is not found. Alternatively, we provide an easy-to-use function `install.spark` for running this manually. If you don't have Spark installed on the computer, you may download it from [Apache Spark Website](https://spark.apache.org/downloads.html). ```{r, eval=FALSE} install.spark() diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java index d7423537ddfcf..4d7f76f673865 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBTypeInfo.java @@ -133,7 +133,7 @@ class LevelDBTypeInfo { // First create the parent indices, then the child indices. ti.indices().forEach(idx -> { - // In LevelDB, there is no parent index for the NUTURAL INDEX. + // In LevelDB, there is no parent index for the NATURAL INDEX. if (idx.parent().isEmpty() || idx.value().equals(KVIndex.NATURAL_INDEX_NAME)) { indices.put(idx.value(), new Index(idx, ti.getAccessor(idx.value()), null)); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index 6dcc703e92669..eb2882074d7c7 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -303,7 +303,7 @@ public void close() { @Override public String toString() { return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) - .append("remoteAdress", channel.remoteAddress()) + .append("remoteAddress", channel.remoteAddress()) .append("clientId", clientId) .append("isActive", isActive()) .toString(); diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java index 64fdb32a67ada..c2b2edc7f07d5 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthEngine.java @@ -287,7 +287,7 @@ private byte[] doCipherOp(int mode, byte[] in, boolean isFinal) } } } catch (InternalError ie) { - // SPARK-25535. The commons-cryto library will throw InternalError if something goes wrong, + // SPARK-25535. The commons-crypto library will throw InternalError if something goes wrong, // and leave bad state behind in the Java wrappers, so it's not safe to use them afterwards. 
if (mode == Cipher.ENCRYPT_MODE) { this.encryptor = null; diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java index 0790f0079c2bd..1c2061699a128 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/AuthEngineSuite.java @@ -150,8 +150,8 @@ public void testEncryptedMessage() throws Exception { ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(buf); - while (emsg.transfered() < emsg.count()) { - emsg.transferTo(channel, emsg.transfered()); + while (emsg.transferred() < emsg.count()) { + emsg.transferTo(channel, emsg.transferred()); } assertEquals(data.length, channel.length()); } finally { @@ -196,9 +196,9 @@ public Long answer(InvocationOnMock invocationOnMock) throws Throwable { TransportCipher.EncryptedMessage emsg = handler.createEncryptedMessage(region); ByteArrayWritableChannel channel = new ByteArrayWritableChannel(testDataLength); // "transferTo" should act correctly when the underlying FileRegion transfers 0 bytes. - assertEquals(0L, emsg.transferTo(channel, emsg.transfered())); - assertEquals(testDataLength, emsg.transferTo(channel, emsg.transfered())); - assertEquals(emsg.transfered(), emsg.count()); + assertEquals(0L, emsg.transferTo(channel, emsg.transferred())); + assertEquals(testDataLength, emsg.transferTo(channel, emsg.transferred())); + assertEquals(emsg.transferred(), emsg.count()); assertEquals(4, channel.length()); } finally { client.close(); diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java index 3bff34e210e3c..af1c2878672c0 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java @@ -129,8 +129,8 @@ private void testFileRegionBody(int totalWrites, int writesPerCall) throws Excep private ByteBuf doWrite(MessageWithHeader msg, int minExpectedWrites) throws Exception { int writes = 0; ByteArrayWritableChannel channel = new ByteArrayWritableChannel((int) msg.count()); - while (msg.transfered() < msg.count()) { - msg.transferTo(channel, msg.transfered()); + while (msg.transferred() < msg.count()) { + msg.transferTo(channel, msg.transferred()); writes++; } assertTrue("Not enough writes!", minExpectedWrites <= writes); diff --git a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java index ecaeec98da182..32c9acd327213 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java @@ -191,28 +191,28 @@ public void testEncryptedMessage() throws Exception { SaslEncryption.EncryptedMessage emsg = new SaslEncryption.EncryptedMessage(backend, msg, 1024); - long count = emsg.transferTo(channel, emsg.transfered()); + long count = emsg.transferTo(channel, emsg.transferred()); assertTrue(count < data.length); assertTrue(count > 0); // Here, the output buffer is full so 
nothing should be transferred. - assertEquals(0, emsg.transferTo(channel, emsg.transfered())); + assertEquals(0, emsg.transferTo(channel, emsg.transferred())); // Now there's room in the buffer, but not enough to transfer all the remaining data, // so the dummy count should be returned. channel.reset(); - assertEquals(1, emsg.transferTo(channel, emsg.transfered())); + assertEquals(1, emsg.transferTo(channel, emsg.transferred())); // Eventually, the whole message should be transferred. for (int i = 0; i < data.length / 32 - 2; i++) { channel.reset(); - assertEquals(1, emsg.transferTo(channel, emsg.transfered())); + assertEquals(1, emsg.transferTo(channel, emsg.transferred())); } channel.reset(); - count = emsg.transferTo(channel, emsg.transfered()); + count = emsg.transferTo(channel, emsg.transferred()); assertTrue("Unexpected count: " + count, count > 1 && count < data.length); - assertEquals(data.length, emsg.transfered()); + assertEquals(data.length, emsg.transferred()); } finally { msg.release(); } @@ -237,9 +237,9 @@ public void testEncryptedMessageChunking() throws Exception { new SaslEncryption.EncryptedMessage(backend, msg.convertToNetty(), data.length / 8); ByteArrayWritableChannel channel = new ByteArrayWritableChannel(data.length); - while (emsg.transfered() < emsg.count()) { + while (emsg.transferred() < emsg.count()) { channel.reset(); - emsg.transferTo(channel, emsg.transfered()); + emsg.transferTo(channel, emsg.transferred()); } verify(backend, times(8)).wrap(any(byte[].class), anyInt(), anyInt()); diff --git a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java index 45e1836da641f..634b40ed450ee 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java @@ -72,7 +72,7 @@ public void testMissingChunk() { Assert.assertNotNull(getChunk(manager, streamId, 2)); manager.connectionTerminated(dummyChannel); - // loaded buffers are not released yet as in production a MangedBuffer returned by getChunk() + // loaded buffers are not released yet as in production a ManagedBuffer returned by getChunk() // would only be released by Netty after it is written to the network Mockito.verify(buffer1, Mockito.never()).release(); Mockito.verify(buffer2, Mockito.never()).release(); diff --git a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java index 4b67aa80351d2..163c52b023822 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/util/TransportFrameDecoderSuite.java @@ -98,7 +98,7 @@ public void testConsolidationPerf() throws Exception { writtenBytes += pieceBytes; } logger.info("Writing 300MiB frame buf with consolidation of threshold " + threshold - + " took " + totalTime + " milis"); + + " took " + totalTime + " millis"); } finally { for (ByteBuf buf : retained) { release(buf); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java index 670612fd6f66a..97ecaa627b66c 100644 
--- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/SimpleDownloadFile.java @@ -32,7 +32,7 @@ * A DownloadFile that does not take any encryption settings into account for reading and * writing data. * - * This does *not* mean the data in the file is un-encrypted -- it could be that the data is + * This does *not* mean the data in the file is unencrypted -- it could be that the data is * already encrypted when its written, and subsequent layer is responsible for decrypting. */ public class SimpleDownloadFile implements DownloadFile { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index b8dda22240042..c6aa5f0b58285 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -635,13 +635,13 @@ public UTF8String trimLeft() { public UTF8String trimLeft(UTF8String trimString) { if (trimString == null) return null; // the searching byte position in the source string - int srchIdx = 0; + int searchIdx = 0; // the first beginning byte position of a non-matching character int trimIdx = 0; - while (srchIdx < numBytes) { + while (searchIdx < numBytes) { UTF8String searchChar = copyUTF8String( - srchIdx, srchIdx + numBytesForFirstByte(this.getByte(srchIdx)) - 1); + searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1); int searchCharBytes = searchChar.numBytes; // try to find the matching for the searchChar in the trimString set if (trimString.find(searchChar, 0) >= 0) { @@ -650,9 +650,9 @@ public UTF8String trimLeft(UTF8String trimString) { // no matching, exit the search break; } - srchIdx += searchCharBytes; + searchIdx += searchCharBytes; } - if (srchIdx == 0) { + if (searchIdx == 0) { // Nothing trimmed return this; } diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index 69a082053aa65..ab488e18ba3f4 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -192,7 +192,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp } } - val nullalbeSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString)) + val nullableSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString)) test("concat") { def concat(origin: Seq[String]): String = @@ -201,7 +201,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp forAll { (inputs: Seq[String]) => assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(inputs.mkString)) } - forAll (nullalbeSeq) { (inputs: Seq[String]) => + forAll (nullableSeq) { (inputs: Seq[String]) => assert(UTF8String.concat(inputs.map(toUTF8): _*) === toUTF8(concat(inputs))) } } @@ -216,7 +216,7 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) === toUTF8(inputs.mkString(sep))) } - forAll(randomString, nullalbeSeq) {(sep: String, inputs: Seq[String]) => + forAll(randomString, nullableSeq) {(sep: String, inputs: Seq[String]) => 
assert(UTF8String.concatWs(toUTF8(sep), inputs.map(toUTF8): _*) === toUTF8(concatWs(sep, inputs))) } diff --git a/dev/appveyor-guide.md b/dev/appveyor-guide.md index a8c0c1ef23ac3..c68b5de9e61d0 100644 --- a/dev/appveyor-guide.md +++ b/dev/appveyor-guide.md @@ -33,22 +33,22 @@ Currently, SparkR on Windows is being tested with [AppVeyor](https://ci.appveyor 2016-09-04 11 07 58 -- Click "Github". +- Click "GitHub". 2016-09-04 11 08 10 -#### After signing up, go to profile to link Github and AppVeyor. +#### After signing up, go to profile to link GitHub and AppVeyor. - Click your account and then click "Profile". 2016-09-04 11 09 43 -- Enable the link with GitHub via clicking "Link Github account". +- Enable the link with GitHub via clicking "Link GitHub account". 2016-09-04 11 09 52 -- Click "Authorize application" in Github site. +- Click "Authorize application" in GitHub site. 2016-09-04 11 10 05 @@ -63,11 +63,11 @@ Currently, SparkR on Windows is being tested with [AppVeyor](https://ci.appveyor 2016-08-30 12 16 35 -- Since we will use Github here, click the "GITHUB" button and then click "Authorize Github" so that AppVeyor can access the Github logs (e.g. commits). +- Since we will use GitHub here, click the "GITHUB" button and then click "Authorize GitHub" so that AppVeyor can access the GitHub logs (e.g. commits). 2016-09-04 11 10 22 -- Click "Authorize application" from Github (the above step will pop up this page). +- Click "Authorize application" from GitHub (the above step will pop up this page). 2016-09-04 11 10 27 diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations index ff41cccde0140..64bd9ada1bf61 100644 --- a/dev/create-release/known_translations +++ b/dev/create-release/known_translations @@ -1,5 +1,5 @@ # This is a mapping of names to be translated through translate-contributors.py -# The format expected on each line should be: - +# The format expected on each line should be: - 012huang - Weiyi Huang 07ARB - Ankit Raj Boudh 10110346 - Xian Liu diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 240f4c8dfd371..d2953a86afafd 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -452,7 +452,7 @@ if [[ "$1" == "publish-release" ]]; then if ! is_dry_run; then nexus_upload=$NEXUS_ROOT/deployByRepositoryId/$staged_repo_id - echo "Uplading files to $nexus_upload" + echo "Uploading files to $nexus_upload" for file in $(find . -type f) do # strip leading ./ diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index cc7ad931198a2..a0e9695d58361 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -110,7 +110,7 @@ def __str__(self): # Under the hood, this runs a `git log` on that tag and parses the fields # from the command output to construct a list of Commit objects. Note that # because certain fields reside in the commit description and cannot be parsed -# through the Github API itself, we need to do some intelligent regex parsing +# through the GitHub API itself, we need to do some intelligent regex parsing # to extract those fields. # # This is written using Git 1.8.5. 
@@ -140,7 +140,7 @@ def get_commits(tag): sys.exit("Unexpected format in commit: %s" % commit_digest) [_hash, author, title] = commit_digest.split(field_end_marker) # The PR number and github username is in the commit message - # itself and cannot be accessed through any Github API + # itself and cannot be accessed through any GitHub API pr_number = None match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) if match: @@ -252,7 +252,7 @@ def nice_join(str_list): return ", ".join(str_list[:-1]) + ", and " + str_list[-1] -# Return the full name of the specified user on Github +# Return the full name of the specified user on GitHub # If the user doesn't exist, return None def get_github_name(author, github_client): if github_client: diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 8340266527fc6..be5611ce65a7d 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -17,7 +17,7 @@ # This script translates invalid authors in the contributors list generated # by generate-contributors.py. When the script encounters an author name that -# is considered invalid, it searches Github and JIRA in an attempt to search +# is considered invalid, it searches GitHub and JIRA in an attempt to search # for replacements. This tool runs in two modes: # # (1) Interactive mode: For each invalid author name, this script presents @@ -68,7 +68,7 @@ if INTERACTIVE_MODE: print("Running in interactive mode. To disable this, provide the --non-interactive flag.") -# Setup Github and JIRA clients +# Setup GitHub and JIRA clients jira_options = {"server": JIRA_API_BASE} jira_client = JIRA(options=jira_options, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) github_client = Github(GITHUB_API_TOKEN) @@ -89,11 +89,11 @@ # Generate candidates for the given author. This should only be called if the given author # name does not represent a full name as this operation is somewhat expensive. Under the -# hood, it makes several calls to the Github and JIRA API servers to find the candidates. +# hood, it makes several calls to the GitHub and JIRA API servers to find the candidates. # # This returns a list of (candidate name, source) 2-tuples. E.g. 
# [ -# (NOT_FOUND, "No full name found for Github user andrewor14"), +# (NOT_FOUND, "No full name found for GitHub user andrewor14"), # ("Andrew Or", "Full name of JIRA user andrewor14"), # ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), # ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), @@ -104,12 +104,12 @@ def generate_candidates(author, issues): candidates = [] - # First check for full name of Github user + # First check for full name of GitHub user github_name = get_github_name(author, github_client) if github_name: - candidates.append((github_name, "Full name of Github user %s" % author)) + candidates.append((github_name, "Full name of GitHub user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) + candidates.append((NOT_FOUND, "No full name found for GitHub user %s" % author)) # Then do the same for JIRA user jira_name = get_jira_name(author, jira_client) if jira_name: @@ -151,7 +151,7 @@ def generate_candidates(author, issues): candidates[i] = (candidate, source) return candidates -# Translate each invalid author by searching for possible candidates from Github and JIRA +# Translate each invalid author by searching for possible candidates from GitHub and JIRA # In interactive mode, this script presents the user with a list of choices and have the user # select from this list. Additionally, the user may also choose to enter a custom name. # In non-interactive mode, this script picks the first valid author name from the candidates @@ -180,12 +180,12 @@ def generate_candidates(author, issues): issues = temp_author.split("/")[1:] candidates = generate_candidates(author, issues) # Print out potential replacement candidates along with the sources, e.g. - # [X] No full name found for Github user andrewor14 + # [X] No full name found for GitHub user andrewor14 # [X] No assignee found for SPARK-1763 # [0] Andrew Or - Full name of JIRA user andrewor14 # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 - # [3] andrewor14 - Raw Github username + # [3] andrewor14 - Raw GitHub username # [4] Custom candidate_names = [] bad_prompts = [] # Prompts that can't actually be selected; print these first. @@ -207,7 +207,7 @@ def generate_candidates(author, issues): print(p) # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: - print(" [%d] %s - Raw Github username" % (raw_index, author)) + print(" [%d] %s - Raw GitHub username" % (raw_index, author)) print(" [%d] Custom" % custom_index) response = raw_input(" Your choice: ") last_index = custom_index diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index 9bcebaa22ab86..27451bba905dd 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -# Utility for updating JIRA's with information about Github pull requests +# Utility for updating JIRA's with information about GitHub pull requests import json import os @@ -142,9 +142,9 @@ def reset_pr_labels(pr_num, jira_components): jira_prs = get_jira_prs() previous_max = get_max_pr() -print("Retrieved %s JIRA PR's from Github" % len(jira_prs)) +print("Retrieved %s JIRA PR's from GitHub" % len(jira_prs)) jira_prs = [(k, v) for k, v in jira_prs if int(v['number']) > previous_max] -print("%s PR's remain after excluding visted ones" % len(jira_prs)) +print("%s PR's remain after excluding visited ones" % len(jira_prs)) num_updates = 0 considered = [] @@ -157,7 +157,7 @@ def reset_pr_labels(pr_num, jira_components): considered = considered + [pr_num] url = pr['html_url'] - title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) + title = "[GitHub] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) try: page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue + "/remotelink")) existing_links = map(lambda l: l['object']['url'], page) @@ -174,7 +174,7 @@ def reset_pr_labels(pr_num, jira_components): destination = {"title": title, "url": url, "icon": icon} # For all possible fields see: # https://developer.atlassian.com/display/JIRADEV/Fields+in+Remote+Issue+Links - # application = {"name": "Github pull requests", "type": "org.apache.spark.jira.github"} + # application = {"name": "GitHub pull requests", "type": "org.apache.spark.jira.github"} jira_client.add_remote_link(issue, destination) comment = "User '%s' has created a pull request for this issue:" % pr['user']['login'] diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 610fb1fd27027..4309a74773e89 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -38,7 +38,7 @@ def print_err(msg): def post_message_to_github(msg, ghprb_pull_id): - print("Attempting to post to Github...") + print("Attempting to post to GitHub...") api_url = os.getenv("GITHUB_API_BASE", "https://api.github.com/repos/apache/spark") url = api_url + "/issues/" + ghprb_pull_id + "/comments" @@ -57,12 +57,12 @@ def post_message_to_github(msg, ghprb_pull_id): if response.getcode() == 201: print(" > Post successful.") except HTTPError as http_e: - print_err("Failed to post message to Github.") + print_err("Failed to post message to GitHub.") print_err(" > http_code: %s" % http_e.code) print_err(" > api_response: %s" % http_e.read()) print_err(" > data: %s" % posted_message) except URLError as url_e: - print_err("Failed to post message to Github.") + print_err("Failed to post message to GitHub.") print_err(" > urllib_status: %s" % url_e.reason[1]) print_err(" > data: %s" % posted_message) @@ -89,7 +89,7 @@ def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): """ Executes a set of pull request checks to ease development and report issues with various components such as style, linting, dependencies, compatibilities, etc. - @return a list of messages to post back to Github + @return a list of messages to post back to GitHub """ # Ensure we save off the current HEAD to revert to current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() @@ -109,7 +109,7 @@ def run_tests(tests_timeout): """ Runs the `dev/run-tests` script and responds with the correct error message under the various failure scenarios. 
- @return a tuple containing the test result code and the result note to post to Github + @return a tuple containing the test result code and the result note to post to GitHub """ test_result_code = subprocess.Popen(['timeout', @@ -198,16 +198,16 @@ def main(): # To write a PR test: # * the file must reside within the dev/tests directory # * be an executable bash script - # * accept three arguments on the command line, the first being the Github PR long commit - # hash, the second the Github SHA1 hash, and the final the current PR hash + # * accept three arguments on the command line, the first being the GitHub PR long commit + # hash, the second the GitHub SHA1 hash, and the final the current PR hash # * and, lastly, return string output to be included in the pr message output that will - # be posted to Github + # be posted to GitHub pr_tests = [ "pr_merge_ability", "pr_public_classes" ] - # `bind_message_base` returns a function to generate messages for Github posting + # `bind_message_base` returns a function to generate messages for GitHub posting github_message = functools.partial(pr_message, build_display_name, build_url, diff --git a/dev/run-tests.py b/dev/run-tests.py index 6bc73ca3669f3..37a15a758d898 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -642,7 +642,7 @@ def main(): # /home/jenkins/anaconda2/envs/py36/bin os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") else: - # else we're running locally or Github Actions. + # else we're running locally or GitHub Actions. build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop3.2") hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") @@ -660,12 +660,12 @@ def main(): included_tags = [] excluded_tags = [] if should_only_test_modules: - # If we're running the tests in Github Actions, attempt to detect and test + # If we're running the tests in GitHub Actions, attempt to detect and test # only the affected modules. if test_env == "github_actions": if os.environ["GITHUB_INPUT_BRANCH"] != "": # Dispatched request - # Note that it assumes Github Actions has already merged + # Note that it assumes GitHub Actions has already merged # the given `GITHUB_INPUT_BRANCH` branch. changed_files = identify_changed_files_from_git_commits( "HEAD", target_branch=os.environ["GITHUB_SHA"]) diff --git a/dev/tests/pr_merge_ability.sh b/dev/tests/pr_merge_ability.sh index 25fdbccac4dd8..a32667730f76c 100755 --- a/dev/tests/pr_merge_ability.sh +++ b/dev/tests/pr_merge_ability.sh @@ -22,7 +22,7 @@ # another branch and returning results to be published. More details can be # found at dev/run-tests-jenkins. # -# Arg1: The Github Pull Request Actual Commit +# Arg1: The GitHub Pull Request Actual Commit # known as `ghprbActualCommit` in `run-tests-jenkins` # Arg2: The SHA1 hash # known as `sha1` in `run-tests-jenkins` diff --git a/dev/tests/pr_public_classes.sh b/dev/tests/pr_public_classes.sh index 479d1851fe0b8..ad1ad5e736594 100755 --- a/dev/tests/pr_public_classes.sh +++ b/dev/tests/pr_public_classes.sh @@ -22,7 +22,7 @@ # another branch and returning results to be published. More details can be # found at dev/run-tests-jenkins. 
# -# Arg1: The Github Pull Request Actual Commit +# Arg1: The GitHub Pull Request Actual Commit # known as `ghprbActualCommit` in `run-tests-jenkins` ghprbActualCommit="$1" diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 98769d951b6ac..5a66bfca27a27 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -1729,7 +1729,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy") ) ++ Seq( - // [SPARK-21680][ML][MLLIB]optimzie Vector coompress + // [SPARK-21680][ML][MLLIB]optimize Vector compress ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize") ) ++ Seq( diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 05413b7091ad9..a5951e0452943 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -198,7 +198,7 @@ object SparkBuild extends PomBuild { ) // Silencer: Scala compiler plugin for warning suppression - // Aim: enable fatal warnings, but supress ones related to using of deprecated APIs + // Aim: enable fatal warnings, but suppress ones related to using of deprecated APIs // depends on scala version: // <2.13 - silencer 1.6.0 and compiler settings to enable fatal warnings // 2.13.0,2.13.1 - silencer 1.7.1 and compiler settings to enable fatal warnings @@ -222,7 +222,7 @@ object SparkBuild extends PomBuild { "-Xfatal-warnings", "-deprecation", "-Ywarn-unused-import", - "-P:silencer:globalFilters=.*deprecated.*" //regex to catch deprecation warnings and supress them + "-P:silencer:globalFilters=.*deprecated.*" //regex to catch deprecation warnings and suppress them ) } else { Seq( @@ -327,7 +327,7 @@ object SparkBuild extends PomBuild { // to be enabled in specific ones that have previous artifacts MimaKeys.mimaFailOnNoPrevious := false, - // To prevent intermittent compliation failures, see also SPARK-33297 + // To prevent intermittent compilation failures, see also SPARK-33297 // Apparently we can remove this when we use JDK 11. Test / classLoaderLayeringStrategy := ClassLoaderLayeringStrategy.Flat ) diff --git a/python/docs/source/_static/css/pyspark.css b/python/docs/source/_static/css/pyspark.css index 2fd8720e2fa0d..1e493c4c868e6 100644 --- a/python/docs/source/_static/css/pyspark.css +++ b/python/docs/source/_static/css/pyspark.css @@ -51,7 +51,7 @@ h3 { max-width: 80%; } -/* Left pannel size */ +/* Left panel size */ @media (min-width: 768px) { .col-md-3 { flex: 0 0 20%; diff --git a/python/docs/source/_templates/autosummary/class.rst b/python/docs/source/_templates/autosummary/class.rst index d794f797ee2ad..b5f62677ee0ed 100644 --- a/python/docs/source/_templates/autosummary/class.rst +++ b/python/docs/source/_templates/autosummary/class.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -.. Workaround to avoud documenting __init__. +.. Workaround to avoid documenting __init__. 
{% extends "!autosummary/class.rst" %} diff --git a/python/docs/source/development/debugging.rst b/python/docs/source/development/debugging.rst index bc141a6f44a6f..829919858f67a 100644 --- a/python/docs/source/development/debugging.rst +++ b/python/docs/source/development/debugging.rst @@ -54,7 +54,7 @@ Enter the name of this new configuration, for example, ``MyRemoteDebugger`` and .. image:: ../../../../docs/img/pyspark-remote-debug1.png :alt: PyCharm remote debugger setting -| After that, you should install the corresponding version of the ``pydevd-pycahrm`` package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install. +| After that, you should install the corresponding version of the ``pydevd-pycharm`` package in all the machines which will connect to your PyCharm debugger. In the previous dialog, it shows the command to install. .. code-block:: text diff --git a/python/docs/source/development/testing.rst b/python/docs/source/development/testing.rst index 08fd730a19f4b..3eab8d04511d6 100644 --- a/python/docs/source/development/testing.rst +++ b/python/docs/source/development/testing.rst @@ -53,5 +53,5 @@ Running tests using GitHub Actions ---------------------------------- You can run the full PySpark tests by using GitHub Actions in your own forked GitHub -repositry with a few clicks. Please refer to +repository with a few clicks. Please refer to `Running tests in your forked repository using GitHub Actions `_ for more details. diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 9c9ff7fa7844b..a90f5fe159553 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -42,7 +42,7 @@ PySpark installation using `PyPI `_ is as fol pip install pyspark -If you want to install extra dependencies for a specific componenet, you can install it as below: +If you want to install extra dependencies for a specific component, you can install it as below: .. code-block:: bash @@ -105,7 +105,7 @@ Now activate the newly created environment with the following command: conda activate pyspark_env You can install pyspark by `Using PyPI <#using-pypi>`_ to install PySpark in the newly created -environment, for example as below. It will install PySpark under the new virtual environemnt +environment, for example as below. It will install PySpark under the new virtual environment ``pyspark_env`` created above. .. code-block:: bash @@ -126,7 +126,7 @@ Manually Downloading -------------------- PySpark is included in the distributions available at the `Apache Spark website `_. -You can download a distribution you want from the site. After that, uncompress the tar file into the directoy where you want +You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want to install Spark, for example, as below: .. 
code-block:: bash diff --git a/python/docs/source/getting_started/quickstart.ipynb b/python/docs/source/getting_started/quickstart.ipynb index ab3645591955f..550b532fefc14 100644 --- a/python/docs/source/getting_started/quickstart.ipynb +++ b/python/docs/source/getting_started/quickstart.ipynb @@ -11,7 +11,7 @@ "\n", "There is also other useful information in Apache Spark documentation site, see the latest version of [Spark SQL and DataFrames](https://spark.apache.org/docs/latest/sql-programming-guide.html), [RDD Programming Guide](https://spark.apache.org/docs/latest/rdd-programming-guide.html), [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html), [Spark Streaming Programming Guide](https://spark.apache.org/docs/latest/streaming-programming-guide.html) and [Machine Learning Library (MLlib) Guide](https://spark.apache.org/docs/latest/ml-guide.html).\n", "\n", - "PySaprk applications start with initializing `SparkSession` which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users." + "PySpark applications start with initializing `SparkSession` which is the entry point of PySpark as below. In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users." ] }, { @@ -392,7 +392,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`DataFrame.collect()` collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too larget to fit in the driver side because it collects all the data from executors to the driver side." + "`DataFrame.collect()` collects the distributed data to the driver side as the local data in Python. Note that this can throw an out-of-memory error when the dataset is too large to fit in the driver side because it collects all the data from executors to the driver side." ] }, { diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index 4286f616374c5..6a631052a642d 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -30,7 +30,7 @@ of Spark's features such as Spark SQL, DataFrame, Streaming, MLlib (Machine Learning) and Spark Core. .. 
image:: ../../../docs/img/pyspark-components.png - :alt: PySpark Compoenents + :alt: PySpark Components **Spark SQL and DataFrame** diff --git a/python/pyspark/__init__.pyi b/python/pyspark/__init__.pyi index 98bd40684c01b..ef07c32b1db7b 100644 --- a/python/pyspark/__init__.pyi +++ b/python/pyspark/__init__.pyi @@ -53,7 +53,7 @@ from pyspark.taskcontext import ( # noqa: F401 ) from pyspark.util import InheritableThread as InheritableThread # noqa: F401 -# Compatiblity imports +# Compatibility imports from pyspark.sql import ( # noqa: F401 SQLContext as SQLContext, HiveContext as HiveContext, diff --git a/python/pyspark/cloudpickle/cloudpickle.py b/python/pyspark/cloudpickle/cloudpickle.py index 8e683e7a6988b..58c274bd79720 100644 --- a/python/pyspark/cloudpickle/cloudpickle.py +++ b/python/pyspark/cloudpickle/cloudpickle.py @@ -88,7 +88,7 @@ def g(): DEFAULT_PROTOCOL = pickle.HIGHEST_PROTOCOL # Track the provenance of reconstructed dynamic classes to make it possible to -# recontruct instances from the matching singleton class definition when +# reconstruct instances from the matching singleton class definition when # appropriate and preserve the usual "isinstance" semantics of Python objects. _DYNAMIC_CLASS_TRACKER_BY_CLASS = weakref.WeakKeyDictionary() _DYNAMIC_CLASS_TRACKER_BY_ID = weakref.WeakValueDictionary() @@ -236,7 +236,7 @@ def _extract_code_globals(co): out_names = {names[oparg] for _, oparg in _walk_global_ops(co)} # Declaring a function inside another one using the "def ..." - # syntax generates a constant code object corresonding to the one + # syntax generates a constant code object corresponding to the one # of the nested function's As the nested function may itself need # global variables, we need to introspect its code, extract its # globals, (look for code object in it's co_consts attribute..) and @@ -457,7 +457,7 @@ def _is_parametrized_type_hint(obj): is_typing = getattr(obj, '__origin__', None) is not None # typing_extensions.Literal - is_litteral = getattr(obj, '__values__', None) is not None + is_literal = getattr(obj, '__values__', None) is not None # typing_extensions.Final is_final = getattr(obj, '__type__', None) is not None @@ -469,7 +469,7 @@ def _is_parametrized_type_hint(obj): getattr(obj, '__result__', None) is not None and getattr(obj, '__args__', None) is not None ) - return any((is_typing, is_litteral, is_final, is_union, is_tuple, + return any((is_typing, is_literal, is_final, is_union, is_tuple, is_callable)) def _create_parametrized_type_hint(origin, args): @@ -699,7 +699,7 @@ def _make_skel_func(code, cell_count, base_globals=None): """ # This function is deprecated and should be removed in cloudpickle 1.7 warnings.warn( - "A pickle file created using an old (<=1.4.1) version of cloudpicke " + "A pickle file created using an old (<=1.4.1) version of cloudpickle " "is currently being loaded. This is not supported by cloudpickle and " "will break in cloudpickle 1.7", category=UserWarning ) diff --git a/python/pyspark/cloudpickle/cloudpickle_fast.py b/python/pyspark/cloudpickle/cloudpickle_fast.py index e8e46b88fdc91..3c48ff7b0a885 100644 --- a/python/pyspark/cloudpickle/cloudpickle_fast.py +++ b/python/pyspark/cloudpickle/cloudpickle_fast.py @@ -6,7 +6,7 @@ is only available for Python versions 3.8+, a lot of backward-compatibility code is also removed. -Note that the C Pickler sublassing API is CPython-specific. Therefore, some +Note that the C Pickler subclassing API is CPython-specific. 
Therefore, some guards present in cloudpickle.py that were written to handle PyPy specificities are not present in cloudpickle_fast.py """ @@ -179,7 +179,7 @@ def _class_getstate(obj): clsdict.pop('__weakref__', None) if issubclass(type(obj), abc.ABCMeta): - # If obj is an instance of an ABCMeta subclass, dont pickle the + # If obj is an instance of an ABCMeta subclass, don't pickle the # cache/negative caches populated during isinstance/issubclass # checks, but pickle the list of registered subclasses of obj. clsdict.pop('_abc_cache', None) @@ -407,7 +407,7 @@ def _class_reduce(obj): def _function_setstate(obj, state): - """Update the state of a dynaamic function. + """Update the state of a dynamic function. As __closure__ and __globals__ are readonly attributes of a function, we cannot rely on the native setstate routine of pickle.load_build, that calls @@ -556,7 +556,7 @@ def dump(self, obj): # `dispatch` attribute. Earlier versions of the protocol 5 CloudPickler # used `CloudPickler.dispatch` as a class-level attribute storing all # reducers implemented by cloudpickle, but the attribute name was not a - # great choice given the meaning of `Cloudpickler.dispatch` when + # great choice given the meaning of `CloudPickler.dispatch` when # `CloudPickler` extends the pure-python pickler. dispatch = dispatch_table @@ -630,7 +630,7 @@ def reducer_override(self, obj): return self._function_reduce(obj) else: # fallback to save_global, including the Pickler's - # distpatch_table + # dispatch_table return NotImplemented else: diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 1bd5961e0525a..1c542fa897ece 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -260,7 +260,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, sys.path.insert(1, filepath) except Exception: warnings.warn( - "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to " + "Failed to add file [%s] specified in 'spark.submit.pyFiles' to " "Python path:\n %s" % (path, "\n ".join(sys.path)), RuntimeWarning) @@ -603,7 +603,7 @@ def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer): tempFile.close() return reader_func(tempFile.name) finally: - # we eagerily reads the file so we can delete right after. + # we eagerly reads the file so we can delete right after. os.unlink(tempFile.name) def pickleFile(self, name, minPartitions=None): diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index fe2e326dff8be..cc0c3a8888a66 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -208,7 +208,7 @@ def local_connect_and_auth(port, auth_secret): return (sockfile, sock) except socket.error as e: emsg = str(e) - errors.append("tried to connect to %s, but an error occured: %s" % (sa, emsg)) + errors.append("tried to connect to %s, but an error occurred: %s" % (sa, emsg)) sock.close() sock = None raise Exception("could not open socket: %s" % errors) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 82b9a6db1eb92..8138f34d7a19e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -5798,7 +5798,7 @@ def setHandleInvalid(self, value): class _VarianceThresholdSelectorParams(HasFeaturesCol, HasOutputCol): """ Params for :py:class:`VarianceThresholdSelector` and - :py:class:`VarianceThresholdSelectorrModel`. + :py:class:`VarianceThresholdSelectorModel`. .. 
versionadded:: 3.1.0 """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 5ce484d964a5a..d37654a7388f5 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1491,7 +1491,7 @@ def setParams(self, *, featuresCol="features", labelCol="label", predictionCol=" maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impuriy="variance", featureSubsetStrategy="all", validationTol=0.01, + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None, leafCol="", minWeightFractionPerNode=0.0, weightCol=None): """ diff --git a/python/pyspark/ml/regression.pyi b/python/pyspark/ml/regression.pyi index b8f1e61859c72..61172305a3726 100644 --- a/python/pyspark/ml/regression.pyi +++ b/python/pyspark/ml/regression.pyi @@ -477,7 +477,7 @@ class GBTRegressor( maxIter: int = ..., stepSize: float = ..., seed: Optional[int] = ..., - impuriy: str = ..., + impurity: str = ..., featureSubsetStrategy: str = ..., validationTol: float = ..., validationIndicatorCol: Optional[str] = ..., diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index f8b61b7c57919..50475210607c8 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -116,7 +116,7 @@ def test_output_columns(self): output = model.transform(df) self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"]) - def test_parallelism_doesnt_change_output(self): + def test_parallelism_does_not_change_output(self): df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), (1.0, Vectors.sparse(2, [], [])), (2.0, Vectors.dense(0.5, 0.5))], diff --git a/python/pyspark/ml/tests/test_image.py b/python/pyspark/ml/tests/test_image.py index ceecdae971c99..1001598779d48 100644 --- a/python/pyspark/ml/tests/test_image.py +++ b/python/pyspark/ml/tests/test_image.py @@ -33,7 +33,7 @@ def test_read_images(self): self.assertEqual(df.count(), 4) first_row = df.take(1)[0][0] # compare `schema.simpleString()` instead of directly compare schema, - # because the df loaded from datasouce may change schema column nullability. + # because the df loaded from datasource may change schema column nullability. self.assertEqual(df.schema.simpleString(), ImageSchema.imageSchema.simpleString()) self.assertEqual(df.schema["image"].dataType.simpleString(), ImageSchema.columnSchema.simpleString()) diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index e1a009643c5f2..cfc18c057f0a8 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -927,7 +927,7 @@ def setInitialCenters(self, centers, weights): @since('1.5.0') def setRandomCenters(self, dim, weight, seed): """ - Set the initial centres to be random samples from + Set the initial centers to be random samples from a gaussian population with constant weights. """ rng = random.RandomState(seed) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 198a9791774a9..2f25c7672a93a 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -457,7 +457,7 @@ def meanAveragePrecision(self): """ Returns the mean average precision (MAP) of all the queries. 
If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. + a log warning is generated. """ return self.call("meanAveragePrecision") @@ -466,7 +466,7 @@ def meanAveragePrecisionAt(self, k): """ Returns the mean average precision (MAP) at first k ranking of all the queries. If a query has an empty ground truth set, the average precision will be zero and - a log warining is generated. + a log warning is generated. """ return self.call("meanAveragePrecisionAt", int(k)) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index e549b0ac43721..c224e38473cf6 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -787,7 +787,7 @@ def _validate(self, dstream): "dstream should be a DStream object, got %s" % type(dstream)) if not self._model: raise ValueError( - "Model must be intialized using setInitialWeights") + "Model must be initialized using setInitialWeights") def predictOn(self, dstream): """ diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index a4b45cf55febe..d8f3cb840e45c 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -178,7 +178,7 @@ def chiSqTest(observed, expected=None): """ If `observed` is Vector, conduct Pearson's chi-squared goodness of fit test of the observed data against the expected distribution, - or againt the uniform distribution (by default), with each category + or against the uniform distribution (by default), with each category having an expected frequency of `1 / len(observed)`. If `observed` is matrix, conduct Pearson's independence test on the diff --git a/python/pyspark/mllib/tests/test_streaming_algorithms.py b/python/pyspark/mllib/tests/test_streaming_algorithms.py index b94fb2778d88d..f6c6779e83f13 100644 --- a/python/pyspark/mllib/tests/test_streaming_algorithms.py +++ b/python/pyspark/mllib/tests/test_streaming_algorithms.py @@ -189,7 +189,7 @@ def generateLogisticInput(offset, scale, nPoints, seed): Generate 1 / (1 + exp(-x * scale + offset)) where, - x is randomnly distributed and the threshold + x is randomly distributed and the threshold and labels for each sample in x is obtained from a random uniform distribution. """ diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 1964070040cdf..34faaacff5eb3 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1253,7 +1253,7 @@ def histogram(self, buckets): and 50 we would have a histogram of 1,0,1. If your histogram is evenly spaced (e.g. [0, 10, 20, 30]), - this can be switched from an O(log n) inseration to O(1) per + this can be switched from an O(log n) insertion to O(1) per element (where n is the number of buckets). Buckets must be sorted, not contain any duplicates, and have @@ -2292,7 +2292,7 @@ def groupWith(self, other, *others): """ return python_cogroup((self, other) + others, numPartitions=None) - # TODO: add variant with custom parittioner + # TODO: add variant with custom partitioner def cogroup(self, other, numPartitions=None): """ For each key k in `self` or `other`, return a resulting RDD that diff --git a/python/pyspark/resource/requests.py b/python/pyspark/resource/requests.py index 74d26d04312c4..4deb22b5948f0 100644 --- a/python/pyspark/resource/requests.py +++ b/python/pyspark/resource/requests.py @@ -189,7 +189,7 @@ def requests(self): class TaskResourceRequest(object): """ - A task resource request. 
This is used in conjuntion with the + A task resource request. This is used in conjunction with the :class:`pyspark.resource.ResourceProfile` to programmatically specify the resources needed for an RDD that will be applied at the stage level. The amount is specified as a Double to allow for saying you want more than 1 task per resource. Valid values @@ -226,7 +226,7 @@ def amount(self): class TaskResourceRequests(object): """ - A set of task resource requests. This is used in conjuntion with the + A set of task resource requests. This is used in conjunction with the :class:`pyspark.resource.ResourceProfileBuilder` to programmatically specify the resources needed for an RDD that will be applied at the stage level. diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index 89be6295f9888..4ba846227188c 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -418,7 +418,7 @@ def _cleanup(self): class ExternalSorter(object): """ - ExtenalSorter will divide the elements into chunks, sort them in + ExternalSorter will divide the elements into chunks, sort them in memory and dump them into disks, finally merge them back. The spilling will only happen when the used memory goes above diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 345e81bd2d73e..760805400aca9 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -425,7 +425,7 @@ def dropFields(self, *fieldNames): +--------------+ However, if you are going to add/replace multiple nested fields, - it is preffered to extract out the nested struct before + it is preferred to extract out the nested struct before adding/replacing multiple fields e.g. >>> df.select(col("a").withField( diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 9fae27a2d9c6c..fe7d26d1bcfd2 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1497,7 +1497,7 @@ def summary(self, *statistics): - stddev - min - max - - arbitrary approximate percentiles specified as a percentage (eg, 75%) + - arbitrary approximate percentiles specified as a percentage (e.g., 75%) If no statistics are given, this function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max. diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index ea91e8593e21f..4dc3129fd6bc2 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1300,7 +1300,7 @@ def spark_partition_id(): Notes ----- - This is indeterministic because it depends on data partitioning and task scheduling. + This is non deterministic because it depends on data partitioning and task scheduling. 
Examples -------- @@ -4110,7 +4110,7 @@ def _get_lambda_parameters(f): # We should exclude functions that use # variable args and keyword argnames # as well as keyword only args - supported_parmeter_types = { + supported_parameter_types = { inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.POSITIONAL_ONLY, } @@ -4125,7 +4125,7 @@ def _get_lambda_parameters(f): ) # and all arguments can be used as positional - if not all(p.kind in supported_parmeter_types for p in parameters): + if not all(p.kind in supported_parameter_types for p in parameters): raise ValueError( "f should use only POSITIONAL or POSITIONAL OR KEYWORD arguments" ) @@ -4640,7 +4640,7 @@ def years(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4664,7 +4664,7 @@ def months(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4688,7 +4688,7 @@ def days(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. @@ -4712,7 +4712,7 @@ def hours(col): Notes ----- - This function can be used only in combinatiion with + This function can be used only in combination with :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy` method of the `DataFrameWriterV2`. diff --git a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi index de679ee2cd017..9148e7a2dca8e 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# This Protocol resuses core Pandas annotation. +# This Protocol reuses core Pandas annotation. # Overall pipeline looks as follows # - Stubgen pandas.core.frame # - Add Protocol as a base class diff --git a/python/pyspark/sql/pandas/_typing/protocols/series.pyi b/python/pyspark/sql/pandas/_typing/protocols/series.pyi index 14babb067da0d..f2de2e8b129fd 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/series.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/series.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# This Protocol resuses core Pandas annotation. +# This Protocol reuses core Pandas annotation. # Overall pipeline looks as follows # - Stubgen pandas.core.series # - Add Protocol as a base class diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 750aa4b0e6c56..4cd0b196d3366 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -99,7 +99,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): ... s3['col2'] = s1 + s2.str.len() ... return s3 ... - >>> # Create a Spark DataFrame that has three columns including a sturct column. + >>> # Create a Spark DataFrame that has three columns including a struct column. ... df = spark.createDataFrame( ... [[1, "a string", ("a nested string",)]], ... 
"long_col long, string_col string, struct_col struct") @@ -114,7 +114,7 @@ def pandas_udf(f=None, returnType=None, functionType=None): | |-- col1: string (nullable = true) | |-- col2: long (nullable = true) - In the following sections, it describes the cominations of the supported type hints. For + In the following sections, it describes the combinations of the supported type hints. For simplicity, `pandas.DataFrame` variant is omitted. * Series to Series diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index ee68b95fc478d..a639a8d51f55c 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -484,7 +484,7 @@ def dummy_pandas_udf(df): col('temp0.key') == col('temp1.key')) self.assertEquals(res.count(), 5) - def test_mixed_scalar_udfs_followed_by_grouby_apply(self): + def test_mixed_scalar_udfs_followed_by_groupby_apply(self): df = self.spark.range(0, 10).toDF('v1') df = df.withColumn('v2', udf(lambda x: x + 1, 'int')(df['v1'])) \ .withColumn('v3', pandas_udf(lambda x: x + 2, 'int')(df['v1'])) diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py index a7dcbfd32ac1c..9a1c0edcce4ed 100644 --- a/python/pyspark/sql/tests/test_udf.py +++ b/python/pyspark/sql/tests/test_udf.py @@ -459,7 +459,7 @@ def test_udf_with_string_return_type(self): self.assertTupleEqual(expected, actual) - def test_udf_shouldnt_accept_noncallable_object(self): + def test_udf_should_not_accept_noncallable_object(self): non_callable = None self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) @@ -683,7 +683,7 @@ def tearDown(self): if SparkContext._active_spark_context is not None: SparkContext._active_spark_context.stop() - def test_udf_init_shouldnt_initialize_context(self): + def test_udf_init_should_not_initialize_context(self): UserDefinedFunction(lambda x: x, StringType()) self.assertIsNone( diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 18f8ba29f95a2..f5db783d2b5bc 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -151,10 +151,10 @@ def toJArray(gateway, jtype, arr): arr : python type list """ - jarr = gateway.new_array(jtype, len(arr)) + jarray = gateway.new_array(jtype, len(arr)) for i in range(0, len(arr)): - jarr[i] = arr[i] - return jarr + jarray[i] = arr[i] + return jarray def require_test_compiled(): diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index c4dc0d3af3332..2e6d7ede88551 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -281,7 +281,7 @@ def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_ def textFileStream(self, directory): """ Create an input stream that monitors a Hadoop-compatible file system - for new files and reads them as text files. Files must be wrriten to the + for new files and reads them as text files. Files must be written to the monitored directory by "moving" them from another location within the same file system. File names starting with . are ignored. The text files must be encoded as UTF-8. 
diff --git a/python/pyspark/tests/test_context.py b/python/pyspark/tests/test_context.py index d86f6c3c1571c..8397ef1c4b62d 100644 --- a/python/pyspark/tests/test_context.py +++ b/python/pyspark/tests/test_context.py @@ -175,8 +175,8 @@ def test_parallelize_eager_cleanup(self): with SparkContext() as sc: temp_files = os.listdir(sc._temp_dir) rdd = sc.parallelize([0, 1, 2]) - post_parallalize_temp_files = os.listdir(sc._temp_dir) - self.assertEqual(temp_files, post_parallalize_temp_files) + post_parallelize_temp_files = os.listdir(sc._temp_dir) + self.assertEqual(temp_files, post_parallelize_temp_files) def test_set_conf(self): # This is for an internal use case. When there is an existing SparkContext, diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 1b09d327a5dfe..8ca4bb37e5fa4 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -59,7 +59,7 @@ def report_times(outfile, boot, init, finish): def add_path(path): - # worker can be used, so donot add path multiple times + # worker can be used, so do not add path multiple times if path not in sys.path: # overwrite system packages sys.path.insert(1, path) diff --git a/python/test_support/userlibrary.py b/python/test_support/userlibrary.py index 73fd26e71f10d..90cd30723ddfe 100755 --- a/python/test_support/userlibrary.py +++ b/python/test_support/userlibrary.py @@ -16,7 +16,7 @@ # """ -Used to test shipping of code depenencies with SparkContext.addPyFile(). +Used to test shipping of code dependencies with SparkContext.addPyFile(). """ diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index e3af1ccc24f1c..41194f3a2676f 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -420,7 +420,7 @@ private[spark] object Config extends Logging { val KUBERNETES_FILE_UPLOAD_PATH = ConfigBuilder("spark.kubernetes.file.upload.path") .doc("Hadoop compatible file system path where files from the local file system " + - "will be uploded to in cluster mode.") + "will be uploaded to in cluster mode.") .version("3.0.0") .stringConf .createOptional diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala index 3f2cb485bbb31..22764d9d2eb0e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshotsStoreImpl.scala @@ -52,7 +52,7 @@ import org.apache.spark.util.ThreadUtils * time-windowed chunks. Each subscriber can choose to receive their snapshot chunks at different * time intervals. *
      - * The subcriber notification callback is guaranteed to be called from a single thread at a time. + * The subscriber notification callback is guaranteed to be called from a single thread at a time. */ private[spark] class ExecutorPodsSnapshotsStoreImpl(subscribersExecutor: ScheduledExecutorService) extends ExecutorPodsSnapshotsStore with Logging { @@ -142,7 +142,7 @@ private[spark] class ExecutorPodsSnapshotsStoreImpl(subscribersExecutor: Schedul } if (notificationCount.decrementAndGet() > 0) { - // There was another concurrent request for this subcriber. Schedule a task to + // There was another concurrent request for this subscriber. Schedule a task to // immediately process snapshots again, so that the subscriber can pick up any // changes that may have happened between the time it started looking at snapshots // above, and the time the concurrent request arrived. diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala index 349cbd04f6027..156740d7c8aee 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesVolumeUtilsSuite.scala @@ -49,14 +49,14 @@ class KubernetesVolumeUtilsSuite extends SparkFunSuite { val sparkConf = new SparkConf(false) sparkConf.set("test.persistentVolumeClaim.volumeName.mount.path", "/path") sparkConf.set("test.persistentVolumeClaim.volumeName.mount.readOnly", "true") - sparkConf.set("test.persistentVolumeClaim.volumeName.options.claimName", "claimeName") + sparkConf.set("test.persistentVolumeClaim.volumeName.options.claimName", "claimName") val volumeSpec = KubernetesVolumeUtils.parseVolumesWithPrefix(sparkConf, "test.").head assert(volumeSpec.volumeName === "volumeName") assert(volumeSpec.mountPath === "/path") assert(volumeSpec.mountReadOnly) assert(volumeSpec.volumeConf.asInstanceOf[KubernetesPVCVolumeConf] === - KubernetesPVCVolumeConf("claimeName")) + KubernetesPVCVolumeConf("claimName")) } test("Parses emptyDir volumes correctly") { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala index 95ee37e3daa41..38f8fac1858f1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala @@ -42,7 +42,7 @@ class MountVolumesFeatureStepSuite extends SparkFunSuite { assert(configuredPod.container.getVolumeMounts.get(0).getReadOnly === false) } - test("Mounts pesistentVolumeClaims") { + test("Mounts persistentVolumeClaims") { val volumeConf = KubernetesVolumeSpec( "testVolume", "/tmp", diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala index bd42f6f05655f..5927af176062d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala @@ -230,7 +230,7 @@ package object config { 
ConfigBuilder("spark.mesos.appJar.local.resolution.mode") .doc("Provides support for the `local:///` scheme to reference the app jar resource in " + "cluster mode. If user uses a local resource (`local:///path/to/jar`) and the config " + - "option is not used it defaults to `host` eg. the mesos fetcher tries to get the " + + "option is not used it defaults to `host` e.g. the mesos fetcher tries to get the " + "resource from the host's file system. If the value is unknown it prints a warning msg " + "in the dispatcher logs and defaults to `host`. If the value is `container` then spark " + "submit in the container will use the jar in the container's path: `/path/to/jar`.") diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index 4620bdb005094..8dbb70b616df1 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -356,7 +356,7 @@ trait MesosSchedulerUtils extends Logging { * https://github.com/apache/mesos/blob/master/src/common/values.cpp * https://github.com/apache/mesos/blob/master/src/common/attributes.cpp * - * @param constraintsVal constains string consisting of ';' separated key-value pairs (separated + * @param constraintsVal contains string consisting of ';' separated key-value pairs (separated * by ':') * @return Map of constraints to match resources offers. */ diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 57af76b46fe64..ac50c1c77a24e 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -781,7 +781,7 @@ private[yarn] class YarnAllocator( val (exitCausedByApp, containerExitReason) = exitStatus match { case ContainerExitStatus.SUCCESS => (false, s"Executor for container $containerId exited because of a YARN event (e.g., " + - "pre-emption) and not because of an error in the running job.") + "preemption) and not because of an error in the running job.") case ContainerExitStatus.PREEMPTED => // Preemption is not the fault of the running tasks, since YARN preempts containers // merely to do resource sharing, and tasks that fail due to preempted executors could diff --git a/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java b/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java index df0ebcc9871ac..89e012ecd42e1 100644 --- a/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java +++ b/resource-managers/yarn/src/test/java/org/apache/hadoop/net/ServerSocketUtil.java @@ -112,7 +112,7 @@ public static int waitForPort(int port, int retries) * The ports are all closed afterwards, * so other network services started may grab those same ports. 
* - * @param numPorts number of required port nubmers + * @param numPorts number of required port numbers * @return array of available port numbers * @throws IOException */ diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index c2bdd971a0fe9..188a48509212d 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -250,7 +250,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (Some(shuffleInfo2)) s2.stop() - // another stop & restart should be fine though (eg., we recover from previous corruption) + // another stop & restart should be fine though (e.g., we recover from previous corruption) s3 = new YarnShuffleService s3.setRecoveryPath(new Path(recoveryLocalDir.toURI)) s3.init(yarnConfig) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 570663c6f6ad3..7a8e3f1d2ccf4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -163,7 +163,7 @@ private[python] object PythonTransformFunctionSerializer { private[streaming] object PythonDStream { /** - * can not access PythonTransformFunctionSerializer.register() via Py4j + * cannot access PythonTransformFunctionSerializer.register() via Py4j * Py4JError: PythonTransformFunctionSerializerregister does not exist in the JVM */ def registerSerializer(ser: PythonTransformFunctionSerializer): Unit = { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index e037f26088347..ca4f3670d5ad7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -960,7 +960,7 @@ object DStream { /** Get the creation site of a DStream from the stack trace of when the DStream is created. */ private[streaming] def getCreationSite(): CallSite = { /** Filtering function that excludes non-user classes for a streaming application */ - def streamingExclustionFunction(className: String): Boolean = { + def streamingExclusionFunction(className: String): Boolean = { def doesMatch(r: Regex): Boolean = r.findFirstIn(className).isDefined val isSparkClass = doesMatch(SPARK_CLASS_REGEX) val isSparkExampleClass = doesMatch(SPARK_EXAMPLES_CLASS_REGEX) @@ -972,6 +972,6 @@ object DStream { // non-Spark and non-Scala class, as the rest would streaming application classes. 
(isSparkClass || isScalaClass) && !isSparkExampleClass && !isSparkStreamingTestClass } - org.apache.spark.util.Utils.getCallSite(streamingExclustionFunction) + org.apache.spark.util.Utils.getCallSite(streamingExclusionFunction) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala index 006bcad5d68c2..ef040681adf37 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala @@ -39,7 +39,7 @@ private[streaming] object HdfsUtils { throw new IllegalStateException("File exists and there is no append support!") } } else { - // we dont' want to use hdfs erasure coding, as that lacks support for append and hflush + // we don't want to use hdfs erasure coding, as that lacks support for append and hflush SparkHadoopUtil.createFile(dfs, dfsPath, false) } } diff --git a/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java index c7cde5674f547..8a57b0c58b228 100644 --- a/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/test/org/apache/spark/streaming/JavaAPISuite.java @@ -1595,7 +1595,7 @@ public void testContextGetOrCreate() throws InterruptedException { /* TEST DISABLED: Pending a discussion about checkpoint() semantics with TD @SuppressWarnings("unchecked") @Test - public void testCheckpointofIndividualStream() throws InterruptedException { + public void testCheckpointOfIndividualStream() throws InterruptedException { List> inputData = Arrays.asList( Arrays.asList("this", "is"), Arrays.asList("a", "test"), diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala index b2b8d2f41fc80..3ffaa62bd75ac 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala @@ -541,12 +541,12 @@ class MapWithStateSuite extends SparkFunSuite with LocalStreamingContext // Setup the stream computation val ssc = new StreamingContext(sc, Seconds(1)) val inputStream = new TestInputStream(ssc, input, numPartitions = 2) - val trackeStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec) + val trackedStateStream = inputStream.map(x => (x, 1)).mapWithState(mapWithStateSpec) val collectedOutputs = new ConcurrentLinkedQueue[Seq[T]] - val outputStream = new TestOutputStream(trackeStateStream, collectedOutputs) + val outputStream = new TestOutputStream(trackedStateStream, collectedOutputs) val collectedStateSnapshots = new ConcurrentLinkedQueue[Seq[(K, S)]] val stateSnapshotStream = new TestOutputStream( - trackeStateStream.stateSnapshots(), collectedStateSnapshots) + trackedStateStream.stateSnapshots(), collectedStateSnapshots) outputStream.register() stateSnapshotStream.register() diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala index 58ce3a93251a9..f06b1feb8c0cd 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/MapWithStateRDDSuite.scala @@ -320,7 +320,7 @@ class MapWithStateRDDSuite extends 
SparkFunSuite with RDDCheckpointTester with B
       makeStateRDDWithLongLineageDataRDD, reliableCheckpoint = true, rddCollectFunc _)
 
   /** Generate MapWithStateRDD with parent state RDD having a long lineage */
-  def makeStateRDDWithLongLineageParenttateRDD(
+  def makeStateRDDWithLongLineageParentStateRDD(
       longLineageRDD: RDD[Int]): MapWithStateRDD[Int, Int, Int, Int] = {
 
     // Create a MapWithStateRDD that has a long lineage using the data RDD with a long lineage
@@ -337,9 +337,9 @@ class MapWithStateRDDSuite extends SparkFunSuite with RDDCheckpointTester with B
   }
 
   testRDD(
-    makeStateRDDWithLongLineageParenttateRDD, reliableCheckpoint = true, rddCollectFunc _)
+    makeStateRDDWithLongLineageParentStateRDD, reliableCheckpoint = true, rddCollectFunc _)
   testRDDPartitions(
-    makeStateRDDWithLongLineageParenttateRDD, reliableCheckpoint = true, rddCollectFunc _)
+    makeStateRDDWithLongLineageParentStateRDD, reliableCheckpoint = true, rddCollectFunc _)
   }
 
   test("checkpointing empty state RDD") {

From cf98a761de677c733f3c33230e1c63ddb785d5c5 Mon Sep 17 00:00:00 2001
From: Kousuke Saruta
Date: Sat, 28 Nov 2020 23:38:11 +0900
Subject: [PATCH 030/150] [SPARK-33570][SQL][TESTS] Set the proper version of gssapi plugin automatically for MariaDBKrbIntegrationSuite

### What changes were proposed in this pull request?

This PR changes mariadb_docker_entrypoint.sh to set the proper version of mariadb-plugin-gssapi-server automatically, based on the installed version of mariadb-server. Also, this PR makes it possible to use an arbitrary docker image by setting the environment variable `MARIADB_DOCKER_IMAGE_NAME`.

### Why are the changes needed?

For `MariaDBKrbIntegrationSuite`, the version of `mariadb-plugin-gssapi-server` is currently set to `10.5.5` in `mariadb_docker_entrypoint.sh`, but that version is no longer available in the official apt repository, so `MariaDBKrbIntegrationSuite` doesn't pass for now. It seems that only the most recent three versions are available for each major version, and they are `10.5.6`, `10.5.7` and `10.5.8` for now. Further, the release cycle of MariaDB seems to be very rapid (1 ~ 2 months), so I don't think it's a good idea to pin `mariadb-plugin-gssapi-server` to a specific version.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Confirmed that `MariaDBKrbIntegrationSuite` passes with the following commands.
```
$ build/sbt -Pdocker-integration-tests -Phive -Phive-thriftserver package "testOnly org.apache.spark.sql.jdbc.MariaDBKrbIntegrationSuite"
```
In this case, we can see what version of `mariadb-plugin-gssapi-server` is going to be installed in the following container log message.
```
Installing mariadb-plugin-gssapi-server=1:10.5.8+maria~focal
```
Or, we can set `MARIADB_DOCKER_IMAGE_NAME` to test against a specific version of MariaDB.
```
$ MARIADB_DOCKER_IMAGE_NAME=mariadb:10.5.6 build/sbt -Pdocker-integration-tests -Phive -Phive-thriftserver package "testOnly org.apache.spark.sql.jdbc.MariaDBKrbIntegrationSuite"
```
```
Installing mariadb-plugin-gssapi-server=1:10.5.6+maria~focal
```

Closes #30515 from sarutak/fix-MariaDBKrbIntegrationSuite.
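For illustration, the image override described above amounts to an environment lookup with a fallback; a minimal sketch (the object name here is hypothetical, and the suite-side wiring is in the diff below):

```scala
// Minimal sketch: pick the MariaDB docker image from the environment, falling back
// to the current default tag used by the suite.
object MariaDBImageSketch {
  val imageName: String = sys.env.getOrElse("MARIADB_DOCKER_IMAGE_NAME", "mariadb:10.5")
}
```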
Authored-by: Kousuke Saruta Signed-off-by: Takeshi Yamamuro --- .../src/test/resources/mariadb_docker_entrypoint.sh | 4 +++- .../spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala | 12 +++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/external/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh b/external/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh index 97c00a9d81b76..ab7d967a927d0 100755 --- a/external/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh +++ b/external/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh @@ -18,7 +18,9 @@ dpkg-divert --add /bin/systemctl && ln -sT /bin/true /bin/systemctl apt update -apt install -y mariadb-plugin-gssapi-server=1:10.5.5+maria~focal +GSSAPI_PLUGIN=mariadb-plugin-gssapi-server=$(dpkg -s mariadb-server | sed -n "s/^Version: \(.*\)/\1/p") +echo "Installing $GSSAPI_PLUGIN" +apt install -y "$GSSAPI_PLUGIN" echo "gssapi_keytab_path=/docker-entrypoint-initdb.d/mariadb.keytab" >> /etc/mysql/mariadb.conf.d/auth_gssapi.cnf echo "gssapi_principal_name=mariadb/__IP_ADDRESS_REPLACE_ME__@EXAMPLE.COM" >> /etc/mysql/mariadb.conf.d/auth_gssapi.cnf docker-entrypoint.sh mysqld diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala index adee2bebe41ce..59a6f530afd7e 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala @@ -24,15 +24,21 @@ import com.spotify.docker.client.messages.{ContainerConfig, HostConfig} import org.apache.spark.sql.execution.datasources.jdbc.connection.SecureConnectionProvider import org.apache.spark.tags.DockerTest +/** + * To run this test suite for a specific version (e.g., mariadb:10.5.8): + * {{{ + * MARIADB_DOCKER_IMAGE_NAME=mariadb:10.5.8 + * ./build/sbt -Pdocker-integration-tests + * "testOnly org.apache.spark.sql.jdbc.MariaDBKrbIntegrationSuite" + * }}} + */ @DockerTest class MariaDBKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override protected val userName = s"mariadb/$dockerIp" override protected val keytabFileName = "mariadb.keytab" override val db = new DatabaseOnDocker { - // If you change `imageName`, you need to update the version of `mariadb-plugin-gssapi-server` - // in `resources/mariadb_docker_entrypoint.sh` accordingly. - override val imageName = "mariadb:10.5" + override val imageName = sys.env.getOrElse("MARIADB_DOCKER_IMAGE_NAME", "mariadb:10.5") override val env = Map( "MYSQL_ROOT_PASSWORD" -> "rootpass" ) From 3650a6bd97b9cecf382f96a55a97ff56b75471cd Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 28 Nov 2020 12:47:47 -0800 Subject: [PATCH 031/150] [SPARK-33580][CORE] resolveDependencyPaths should use classifier attribute of artifact ### What changes were proposed in this pull request? This patch proposes to use classifier attribute to construct artifact path instead of type. ### Why are the changes needed? `resolveDependencyPaths` now takes artifact type to decide to add "-tests" postfix. However, the path pattern of ivy in `resolveMavenCoordinates` is `[organization]_[artifact][revision](-[classifier]).[ext]`. We should use classifier instead of type to construct file path. 
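For illustration, a rough sketch of that naming scheme (a hypothetical helper, not the actual patch; the real change to `resolveDependencyPaths` is in the diff below):

```scala
import java.io.File

// Hypothetical helper sketching the classifier-aware jar naming described above:
// "<org>_<artifact>-<revision>[-<classifier>].jar", where "-tests" is just one possible
// classifier rather than a special case keyed off the artifact type.
object JarPathSketch {
  def cachedJarPath(
      cacheDir: File,
      org: String,
      name: String,
      revision: String,
      classifier: Option[String]): String = {
    val suffix = classifier.map("-" + _).getOrElse("")
    s"${cacheDir.getAbsolutePath}${File.separator}${org}_$name-$revision$suffix.jar"
  }
}
```

In the actual patch, the classifier is read from the resolved artifact's extra attributes when present.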
### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test. Manual test. Closes #30524 from viirya/SPARK-33580. Authored-by: Liang-Chi Hsieh Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/deploy/SparkSubmit.scala | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 4b17661496808..7332c6d54c981 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -1186,12 +1186,16 @@ private[spark] object SparkSubmitUtils { def resolveDependencyPaths( artifacts: Array[AnyRef], cacheDirectory: File): String = { - artifacts.map { ai => - val artifactInfo = ai.asInstanceOf[Artifact] - val artifact = artifactInfo.getModuleRevisionId - val testSuffix = if (artifactInfo.getType == "test-jar") "-tests" else "" + artifacts.map { artifactInfo => + val artifact = artifactInfo.asInstanceOf[Artifact].getModuleRevisionId + val extraAttrs = artifactInfo.asInstanceOf[Artifact].getExtraAttributes + val classifier = if (extraAttrs.containsKey("classifier")) { + "-" + extraAttrs.get("classifier") + } else { + "" + } cacheDirectory.getAbsolutePath + File.separator + - s"${artifact.getOrganisation}_${artifact.getName}-${artifact.getRevision}${testSuffix}.jar" + s"${artifact.getOrganisation}_${artifact.getName}-${artifact.getRevision}$classifier.jar" }.mkString(",") } From bfe9380ba2bc9762ccfaa36d3ed938867c143876 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 28 Nov 2020 16:58:40 -0800 Subject: [PATCH 032/150] [MINOR][SQL] Remove `getTables()` from `r.SQLUtils` ### What changes were proposed in this pull request? Remove the unused method `getTables()` from `r.SQLUtils`. The method was used before the changes https://github.com/apache/spark/pull/17483 but R's `tables.default` was rewritten using `listTables()`: https://github.com/apache/spark/pull/17483/files#diff-2c01472a7bcb1d318244afcd621d726e00d36cd15dffe7e44fa96c54fce4cd9aR220-R223 ### Why are the changes needed? To improve code maintenance, and remove the dead code. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By R tests. Closes #30527 from MaxGekk/remove-getTables-in-r-SQLUtils. 
Authored-by: Max Gekk Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/api/r/SQLUtils.scala | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index 693be99d47495..1d1358487abcb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.{ExprUtils, GenericRowWithSchema} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.arrow.ArrowConverters -import org.apache.spark.sql.execution.command.ShowTablesCommand import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.types._ @@ -216,15 +215,6 @@ private[sql] object SQLUtils extends Logging { } } - def getTables(sparkSession: SparkSession, databaseName: String): DataFrame = { - databaseName match { - case n: String if n != null && n.trim.nonEmpty => - Dataset.ofRows(sparkSession, ShowTablesCommand(Some(n), None)) - case _ => - Dataset.ofRows(sparkSession, ShowTablesCommand(None, None)) - } - } - def getTableNames(sparkSession: SparkSession, databaseName: String): Array[String] = { val db = databaseName match { case _ if databaseName != null && databaseName.trim.nonEmpty => From ba178f852f8e4b11a243d907ac204b30a60369b5 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 29 Nov 2020 09:36:55 +0800 Subject: [PATCH 033/150] [SPARK-33581][SQL][TEST] Refactor HivePartitionFilteringSuite ### What changes were proposed in this pull request? This pr refactor HivePartitionFilteringSuite. ### Why are the changes needed? To make it easy to maintain. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? N/A Closes #30525 from wangyum/SPARK-33581. 
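The gist of the refactoring, as a rough sketch: each partition dimension is declared once, and the expected partition count is derived from their product instead of being hard-coded (the object name is illustrative; the values mirror the diff that follows):

```scala
// Rough sketch of the parameterization introduced by the refactoring: test data
// dimensions are declared once, and the expected partition count falls out of their product.
object HivePartitionFilteringDims {
  val dsValue = 20170101 to 20170103
  val hValue = 0 to 4
  val chunkValue = Seq("aa", "ab", "ba", "bb")
  val dateValue = Seq("2019-01-01", "2019-01-02", "2019-01-03")
  val dateStrValue = Seq("2020-01-01", "2020-01-02", "2020-01-03")

  // 3 * 5 * 4 * 3 * 3 = 540 partitions created for the test table
  val testPartitionCount: Int =
    dsValue.size * hValue.size * chunkValue.size * dateValue.size * dateStrValue.size
}
```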
Authored-by: Yuming Wang Signed-off-by: Yuming Wang --- .../client/HivePartitionFilteringSuite.scala | 291 +++++++++++------- 1 file changed, 177 insertions(+), 114 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index ab83f751f1425..e07fbc29ee8aa 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -39,7 +39,13 @@ class HivePartitionFilteringSuite(version: String) private val tryDirectSqlKey = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname - private val testPartitionCount = 3 * 5 * 4 + private val dsValue = 20170101 to 20170103 + private val hValue = 0 to 4 + private val chunkValue = Seq("aa", "ab", "ba", "bb") + private val dateValue = Seq("2019-01-01", "2019-01-02", "2019-01-03") + private val dateStrValue = Seq("2020-01-01", "2020-01-02", "2020-01-03") + private val testPartitionCount = + dsValue.size * hValue.size * chunkValue.size * dateValue.size * dateStrValue.size private val storageFormat = CatalogStorageFormat( locationUri = None, @@ -57,23 +63,28 @@ class HivePartitionFilteringSuite(version: String) val client = buildClient(hadoopConf) val tableSchema = new StructType().add("value", "int").add("ds", "int").add("h", "int").add("chunk", "string") + .add("d", "date").add("datestr", "string") val table = CatalogTable( identifier = TableIdentifier("test", Some("default")), tableType = CatalogTableType.MANAGED, schema = tableSchema, - partitionColumnNames = Seq("ds", "h", "chunk"), + partitionColumnNames = Seq("ds", "h", "chunk", "d", "datestr"), storage = storageFormat) client.createTable(table, ignoreIfExists = false) val partitions = for { - ds <- 20170101 to 20170103 - h <- 0 to 4 - chunk <- Seq("aa", "ab", "ba", "bb") + ds <- dsValue + h <- hValue + chunk <- chunkValue + date <- dateValue + dateStr <- dateStrValue } yield CatalogTablePartition(Map( "ds" -> ds.toString, "h" -> h.toString, - "chunk" -> chunk + "chunk" -> chunk, + "d" -> date, + "datestr" -> dateStr ), storageFormat) assert(partitions.size == testPartitionCount) @@ -108,17 +119,21 @@ class HivePartitionFilteringSuite(version: String) // Should return all partitions where <=> is not supported testMetastorePartitionFiltering( attr("ds") <=> 20170101, - 20170101 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + dsValue, + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: ds=20170101") { testMetastorePartitionFiltering( attr("ds") === 20170101, 20170101 to 20170101, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: ds=(20170101 + 1) and h=0") { @@ -126,41 +141,51 @@ class HivePartitionFilteringSuite(version: String) // comparisons to non-literal values testMetastorePartitionFiltering( attr("ds") === (Literal(20170101) + 1) && attr("h") === 0, - 20170101 to 20170103, + dsValue, 0 to 0, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: chunk='aa'") { testMetastorePartitionFiltering( attr("chunk") === "aa", - 20170101 to 20170103, - 0 to 4, - "aa" :: Nil) + dsValue, + hValue, + "aa" :: Nil, + dateValue, + dateStrValue) } test("getPartitionsByFilter: cast(chunk as int)=1 (not a valid partition predicate)") { 
testMetastorePartitionFiltering( attr("chunk").cast(IntegerType) === 1, - 20170101 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + dsValue, + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: cast(chunk as boolean)=true (not a valid partition predicate)") { testMetastorePartitionFiltering( attr("chunk").cast(BooleanType) === true, - 20170101 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + dsValue, + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: 20170101=ds") { testMetastorePartitionFiltering( Literal(20170101) === attr("ds"), 20170101 to 20170101, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: ds=20170101 and h=2") { @@ -168,7 +193,9 @@ class HivePartitionFilteringSuite(version: String) attr("ds") === 20170101 && attr("h") === 2, 20170101 to 20170101, 2 to 2, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: cast(ds as long)=20170101L and h=2") { @@ -176,39 +203,49 @@ class HivePartitionFilteringSuite(version: String) attr("ds").cast(LongType) === 20170101L && attr("h") === 2, 20170101 to 20170101, 2 to 2, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: ds=20170101 or ds=20170102") { testMetastorePartitionFiltering( attr("ds") === 20170101 || attr("ds") === 20170102, 20170101 to 20170102, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: ds in (20170102, 20170103) (using IN expression)") { testMetastorePartitionFiltering( attr("ds").in(20170102, 20170103), 20170102 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: cast(ds as long) in (20170102L, 20170103L) (using IN expression)") { testMetastorePartitionFiltering( attr("ds").cast(LongType).in(20170102L, 20170103L), 20170102 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil) + hValue, + chunkValue, + dateValue, + dateStrValue) } test("getPartitionsByFilter: ds in (20170102, 20170103) (using INSET expression)") { testMetastorePartitionFiltering( attr("ds").in(20170102, 20170103), 20170102 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil, { + hValue, + chunkValue, + dateValue, + dateStrValue, { case expr @ In(v, list) if expr.inSetConvertible => InSet(v, list.map(_.eval(EmptyRow)).toSet) }) @@ -219,8 +256,10 @@ class HivePartitionFilteringSuite(version: String) testMetastorePartitionFiltering( attr("ds").cast(LongType).in(20170102L, 20170103L), 20170102 to 20170103, - 0 to 4, - "aa" :: "ab" :: "ba" :: "bb" :: Nil, { + hValue, + chunkValue, + dateValue, + dateStrValue, { case expr @ In(v, list) if expr.inSetConvertible => InSet(v, list.map(_.eval(EmptyRow)).toSet) }) @@ -229,41 +268,45 @@ class HivePartitionFilteringSuite(version: String) test("getPartitionsByFilter: chunk in ('ab', 'ba') (using IN expression)") { testMetastorePartitionFiltering( attr("chunk").in("ab", "ba"), - 20170101 to 20170103, - 0 to 4, - "ab" :: "ba" :: Nil) + dsValue, + hValue, + "ab" :: "ba" :: Nil, + dateValue, + dateStrValue) } test("getPartitionsByFilter: chunk in ('ab', 'ba') (using INSET expression)") { testMetastorePartitionFiltering( attr("chunk").in("ab", "ba"), - 20170101 to 20170103, - 0 to 4, - "ab" :: "ba" :: Nil, { + dsValue, + hValue, + "ab" :: 
"ba" :: Nil, + dateValue, + dateStrValue, { case expr @ In(v, list) if expr.inSetConvertible => InSet(v, list.map(_.eval(EmptyRow)).toSet) }) } test("getPartitionsByFilter: (ds=20170101 and h>=2) or (ds=20170102 and h<2)") { - val day1 = (20170101 to 20170101, 2 to 4, Seq("aa", "ab", "ba", "bb")) - val day2 = (20170102 to 20170102, 0 to 1, Seq("aa", "ab", "ba", "bb")) + val day1 = (20170101 to 20170101, 2 to 4, chunkValue, dateValue, dateStrValue) + val day2 = (20170102 to 20170102, 0 to 1, chunkValue, dateValue, dateStrValue) testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >= 2) || (attr("ds") === 20170102 && attr("h") < 2), day1 :: day2 :: Nil) } test("getPartitionsByFilter: (ds=20170101 and h>=2) or (ds=20170102 and h<(1+1))") { - val day1 = (20170101 to 20170101, 2 to 4, Seq("aa", "ab", "ba", "bb")) + val day1 = (20170101 to 20170101, 2 to 4, chunkValue, dateValue, dateStrValue) // Day 2 should include all hours because we can't build a filter for h<(7+1) - val day2 = (20170102 to 20170102, 0 to 4, Seq("aa", "ab", "ba", "bb")) + val day2 = (20170102 to 20170102, 0 to 4, chunkValue, dateValue, dateStrValue) testMetastorePartitionFiltering((attr("ds") === 20170101 && attr("h") >= 2) || (attr("ds") === 20170102 && attr("h") < (Literal(1) + 1)), day1 :: day2 :: Nil) } test("getPartitionsByFilter: " + "chunk in ('ab', 'ba') and ((ds=20170101 and h>=2) or (ds=20170102 and h<2))") { - val day1 = (20170101 to 20170101, 2 to 4, Seq("ab", "ba")) - val day2 = (20170102 to 20170102, 0 to 1, Seq("ab", "ba")) + val day1 = (20170101 to 20170101, 2 to 4, Seq("ab", "ba"), dateValue, dateStrValue) + val day2 = (20170102 to 20170102, 0 to 1, Seq("ab", "ba"), dateValue, dateStrValue) testMetastorePartitionFiltering(attr("chunk").in("ab", "ba") && ((attr("ds") === 20170101 && attr("h") >= 2) || (attr("ds") === 20170102 && attr("h") < 2)), day1 :: day2 :: Nil) @@ -272,93 +315,105 @@ class HivePartitionFilteringSuite(version: String) test("getPartitionsByFilter: chunk contains bb") { testMetastorePartitionFiltering( attr("chunk").contains("bb"), - (20170101 to 20170103, 0 to 4, Seq("bb")) :: Nil) + dsValue, + hValue, + Seq("bb"), + dateValue, + dateStrValue) } test("getPartitionsByFilter: chunk startsWith b") { testMetastorePartitionFiltering( attr("chunk").startsWith("b"), - (20170101 to 20170103, 0 to 4, Seq("ba", "bb")) :: Nil) + dsValue, + hValue, + Seq("ba", "bb"), + dateValue, + dateStrValue) } test("getPartitionsByFilter: chunk endsWith b") { testMetastorePartitionFiltering( attr("chunk").endsWith("b"), - (20170101 to 20170103, 0 to 4, Seq("ab", "bb")) :: Nil) + dsValue, + hValue, + Seq("ab", "bb"), + dateValue, + dateStrValue) } test("getPartitionsByFilter: chunk in ('ab', 'ba') and ((cast(ds as string)>'20170102')") { - val day = (20170101 to 20170103, 0 to 4, Seq("ab", "ba")) testMetastorePartitionFiltering( attr("chunk").in("ab", "ba") && (attr("ds").cast(StringType) > "20170102"), - day :: Nil) + dsValue, + hValue, + Seq("ab", "ba"), + dateValue, + dateStrValue) } - test("getPartitionsByFilter: date type pruning by metastore") { - val table = CatalogTable( - identifier = TableIdentifier("test_date", Some("default")), - tableType = CatalogTableType.MANAGED, - schema = new StructType().add("value", "int").add("part", "date"), - partitionColumnNames = Seq("part"), - storage = storageFormat) - client.createTable(table, ignoreIfExists = false) + test("getPartitionsByFilter: d=2019-01-01") { + testMetastorePartitionFiltering( + attr("d") === Date.valueOf("2019-01-01"), + dsValue, 
+ hValue, + chunkValue, + Seq("2019-01-01"), + dateStrValue) + } - val partitions = - for { - date <- Seq("2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04") - } yield CatalogTablePartition(Map( - "part" -> date - ), storageFormat) - assert(partitions.size == 4) - - client.createPartitions("default", "test_date", partitions, ignoreIfExists = false) - - def testDataTypeFiltering( - filterExprs: Seq[Expression], - expectedPartitionCubes: Seq[Seq[Date]]): Unit = { - val filteredPartitions = client.getPartitionsByFilter( - client.getTable("default", "test_date"), - filterExprs, - SQLConf.get.sessionLocalTimeZone) - - val expectedPartitions = expectedPartitionCubes.map { - expectedDt => - for { - dt <- expectedDt - } yield Set( - "part" -> dt.toString - ) - }.reduce(_ ++ _) - - assert(filteredPartitions.map(_.spec.toSet).toSet == expectedPartitions.toSet) - } + test("getPartitionsByFilter: d>2019-01-02") { + testMetastorePartitionFiltering( + attr("d") > Date.valueOf("2019-01-02"), + dsValue, + hValue, + chunkValue, + Seq("2019-01-03"), + dateStrValue) + } + + test("getPartitionsByFilter: In(d, 2019-01-01, 2019-01-02)") { + testMetastorePartitionFiltering( + In(attr("d"), + Seq("2019-01-01", "2019-01-02").map(d => Literal(Date.valueOf(d)))), + dsValue, + hValue, + chunkValue, + Seq("2019-01-01", "2019-01-02"), + dateStrValue) + } - val dateAttr: Attribute = AttributeReference("part", DateType)() + test("getPartitionsByFilter: InSet(d, 2019-01-01, 2019-01-02)") { + testMetastorePartitionFiltering( + InSet(attr("d"), + Set("2019-01-01", "2019-01-02").map(d => Literal(Date.valueOf(d)).eval(EmptyRow))), + dsValue, + hValue, + chunkValue, + Seq("2019-01-01", "2019-01-02"), + dateStrValue) + } - testDataTypeFiltering( - Seq(dateAttr === Date.valueOf("2019-01-01")), - Seq("2019-01-01").map(Date.valueOf) :: Nil) - testDataTypeFiltering( - Seq(dateAttr > Date.valueOf("2019-01-02")), - Seq("2019-01-03", "2019-01-04").map(Date.valueOf) :: Nil) - testDataTypeFiltering( - Seq(In(dateAttr, - Seq("2019-01-01", "2019-01-02").map(d => Literal(Date.valueOf(d))))), - Seq("2019-01-01", "2019-01-02").map(Date.valueOf) :: Nil) - testDataTypeFiltering( - Seq(InSet(dateAttr, - Set("2019-01-01", "2019-01-02").map(d => Literal(Date.valueOf(d)).eval(EmptyRow)))), - Seq("2019-01-01", "2019-01-02").map(Date.valueOf) :: Nil) + test("getPartitionsByFilter: cast(datestr as date)= 2020-01-01") { + testMetastorePartitionFiltering( + attr("datestr").cast(DateType) === Date.valueOf("2020-01-01"), + dsValue, + hValue, + chunkValue, + dateValue, + dateStrValue) } private def testMetastorePartitionFiltering( filterExpr: Expression, expectedDs: Seq[Int], expectedH: Seq[Int], - expectedChunks: Seq[String]): Unit = { + expectedChunks: Seq[String], + expectedD: Seq[String], + expectedDatestr: Seq[String]): Unit = { testMetastorePartitionFiltering( filterExpr, - (expectedDs, expectedH, expectedChunks) :: Nil, + (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr) :: Nil, identity) } @@ -367,22 +422,25 @@ class HivePartitionFilteringSuite(version: String) expectedDs: Seq[Int], expectedH: Seq[Int], expectedChunks: Seq[String], + expectedD: Seq[String], + expectedDatestr: Seq[String], transform: Expression => Expression): Unit = { testMetastorePartitionFiltering( filterExpr, - (expectedDs, expectedH, expectedChunks) :: Nil, + (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr) :: Nil, transform) } private def testMetastorePartitionFiltering( filterExpr: Expression, - expectedPartitionCubes: Seq[(Seq[Int], 
Seq[Int], Seq[String])]): Unit = { + expectedPartitionCubes: + Seq[(Seq[Int], Seq[Int], Seq[String], Seq[String], Seq[String])]): Unit = { testMetastorePartitionFiltering(filterExpr, expectedPartitionCubes, identity) } private def testMetastorePartitionFiltering( filterExpr: Expression, - expectedPartitionCubes: Seq[(Seq[Int], Seq[Int], Seq[String])], + expectedPartitionCubes: Seq[(Seq[Int], Seq[Int], Seq[String], Seq[String], Seq[String])], transform: Expression => Expression): Unit = { val filteredPartitions = client.getPartitionsByFilter(client.getTable("default", "test"), Seq( @@ -390,20 +448,25 @@ class HivePartitionFilteringSuite(version: String) ), SQLConf.get.sessionLocalTimeZone) val expectedPartitionCount = expectedPartitionCubes.map { - case (expectedDs, expectedH, expectedChunks) => - expectedDs.size * expectedH.size * expectedChunks.size + case (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr) => + expectedDs.size * expectedH.size * expectedChunks.size * + expectedD.size * expectedDatestr.size }.sum val expectedPartitions = expectedPartitionCubes.map { - case (expectedDs, expectedH, expectedChunks) => + case (expectedDs, expectedH, expectedChunks, expectedD, expectedDatestr) => for { ds <- expectedDs h <- expectedH chunk <- expectedChunks + d <- expectedD + datestr <- expectedDatestr } yield Set( "ds" -> ds.toString, "h" -> h.toString, - "chunk" -> chunk + "chunk" -> chunk, + "d" -> d, + "datestr" -> datestr ) }.reduce(_ ++ _) From b94ff1e870152ac692c6f1ebf3d110caa274ebb2 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 29 Nov 2020 11:24:58 -0800 Subject: [PATCH 034/150] [SPARK-33590][DOCS][SQL] Add missing sub-bullets in Spark SQL Guide ### What changes were proposed in this pull request? Add the missing sub-bullets in the left side of `Spark SQL Guide` ### Why are the changes needed? The three sub-bullets in the left side is not consistent with the contents (five bullets) in the right side. ![image](https://user-images.githubusercontent.com/1315079/100546388-7a21e880-32a4-11eb-922d-62a52f4f9f9b.png) ### Does this PR introduce _any_ user-facing change? Yes, you can see more lines in the left menu. ### How was this patch tested? Manually build the doc as follows. This can be verified as attached: ``` cd docs SKIP_API=1 jekyll build firefox _site/sql-pyspark-pandas-with-arrow.html ``` ![image](https://user-images.githubusercontent.com/1315079/100546399-8ad25e80-32a4-11eb-80ac-44af0aebc717.png) Closes #30537 from kiszk/SPARK-33590. Authored-by: Kazuaki Ishizaki Signed-off-by: Dongjoon Hyun --- docs/_data/menu-sql.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 2207bd6a17656..ec0b404fe672f 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -51,6 +51,10 @@ url: sql-performance-tuning.html#other-configuration-options - text: Join Strategy Hints for SQL Queries url: sql-performance-tuning.html#join-strategy-hints-for-sql-queries + - text: Coalesce Hints for SQL Queries + url: sql-performance-tuning.html#coalesce-hints-for-sql-queries + - text: Adaptive Query Execution + url: sql-performance-tuning.html#adaptive-query-execution - text: Distributed SQL Engine url: sql-distributed-sql-engine.html subitems: From c8286ec41616909f1f6e452ce63f0e7605d5bc63 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Sun, 29 Nov 2020 11:56:48 -0800 Subject: [PATCH 035/150] [SPARK-33587][CORE] Kill the executor on nested fatal errors ### What changes were proposed in this pull request? 
Currently we will kill the executor when hitting a fatal error. However, if the fatal error is wrapped by another exception, such as:
- java.util.concurrent.ExecutionException, com.google.common.util.concurrent.UncheckedExecutionException, com.google.common.util.concurrent.ExecutionError when using Guava cache or Java thread pool.
- SparkException thrown from https://github.com/apache/spark/blob/cf98a761de677c733f3c33230e1c63ddb785d5c5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala#L231 or https://github.com/apache/spark/blob/cf98a761de677c733f3c33230e1c63ddb785d5c5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala#L296

We will still keep the executor running. Fatal errors are usually unrecoverable (such as OutOfMemoryError); some components may be in a broken state when hitting a fatal error, and it's hard to predict the behavior of a broken component. Hence, it's better to detect nested fatal errors as well and kill the executor. Then we can rely on Spark's fault tolerance to recover.

### Why are the changes needed?

Fatal errors are usually unrecoverable (such as OutOfMemoryError); some components may be in a broken state when hitting a fatal error, and it's hard to predict the behavior of a broken component. Hence, it's better to detect nested fatal errors as well and kill the executor. Then we can rely on Spark's fault tolerance to recover.

### Does this PR introduce _any_ user-facing change?

Yep. There is a slight internal behavior change on when to kill an executor. We will kill the executor when detecting a nested fatal error in the exception chain. `spark.executor.killOnFatalError.depth` is added to allow users to turn off this change if the slight behavior change impacts them.

### How was this patch tested?

The new method `Executor.isFatalError` is covered by the new `ExecutorSuite` test `SPARK-33587: isFatalError`.

Closes #30528 from zsxwing/SPARK-33587.
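For illustration, the detection boils down to walking the cause chain up to a bounded depth; a minimal sketch (simplified; the real logic is `Executor.isFatalError` in the diff below, which uses Spark's own fatal-error check and also skips `SparkOutOfMemoryError`):

```scala
import scala.annotation.tailrec
import scala.util.control.NonFatal

object FatalErrorCheckSketch {
  // Walk the cause chain looking for a fatal error, but only `depthToCheck` levels deep,
  // so a cycle in the cause chain cannot make the check loop forever.
  // "Fatal" is approximated here by scala.util.control.NonFatal for the sketch.
  @tailrec
  def containsFatal(t: Throwable, depthToCheck: Int): Boolean = {
    if (t == null || depthToCheck <= 0) {
      false
    } else if (!NonFatal(t)) {
      true
    } else {
      containsFatal(t.getCause, depthToCheck - 1)
    }
  }
}
```

With the default `spark.executor.killOnFatalError.depth` of 5, an `OutOfMemoryError` wrapped in, say, an `ExecutionException` and then a `SparkException` is therefore still detected as fatal.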
Authored-by: Shixiong Zhu Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/executor/Executor.scala | 28 ++++++- .../spark/internal/config/package.scala | 11 +++ .../apache/spark/executor/ExecutorSuite.scala | 73 ++++++++++++++++++- 3 files changed, 108 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index f7246448959e9..efb0b2c26d9a9 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -150,6 +150,8 @@ private[spark] class Executor( // Whether to monitor killed / interrupted tasks private val taskReaperEnabled = conf.get(TASK_REAPER_ENABLED) + private val killOnFatalErrorDepth = conf.get(EXECUTOR_KILL_ON_FATAL_ERROR_DEPTH) + // Create our ClassLoader // do this after SparkEnv creation so can access the SecurityManager private val urlClassLoader = createClassLoader() @@ -648,7 +650,7 @@ private[spark] class Executor( plugins.foreach(_.onTaskFailed(reason)) execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(reason)) - case t: Throwable if hasFetchFailure && !Utils.isFatalError(t) => + case t: Throwable if hasFetchFailure && !Executor.isFatalError(t, killOnFatalErrorDepth) => val reason = task.context.fetchFailed.get.toTaskFailedReason if (!t.isInstanceOf[FetchFailedException]) { // there was a fetch failure in the task, but some user code wrapped that exception @@ -711,7 +713,7 @@ private[spark] class Executor( // Don't forcibly exit unless the exception was inherently fatal, to avoid // stopping other tasks unnecessarily. - if (!t.isInstanceOf[SparkOutOfMemoryError] && Utils.isFatalError(t)) { + if (Executor.isFatalError(t, killOnFatalErrorDepth)) { uncaughtExceptionHandler.uncaughtException(Thread.currentThread(), t) } } finally { @@ -997,4 +999,26 @@ private[spark] object Executor { // Used to store executorSource, for local mode only var executorSourceLocalModeOnly: ExecutorSource = null + + /** + * Whether a `Throwable` thrown from a task is a fatal error. We will use this to decide whether + * to kill the executor. + * + * @param depthToCheck The max depth of the exception chain we should search for a fatal error. 0 + * means not checking any fatal error (in other words, return false), 1 means + * checking only the exception but not the cause, and so on. This is to avoid + * `StackOverflowError` when hitting a cycle in the exception chain. + */ + def isFatalError(t: Throwable, depthToCheck: Int): Boolean = { + if (depthToCheck <= 0) { + false + } else { + t match { + case _: SparkOutOfMemoryError => false + case e if Utils.isFatalError(e) => true + case e if e.getCause != null => isFatalError(e.getCause, depthToCheck - 1) + case _ => false + } + } + } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index b38d0e5c617b9..b8bcb374ef961 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -1946,6 +1946,17 @@ package object config { .booleanConf .createWithDefault(false) + private[spark] val EXECUTOR_KILL_ON_FATAL_ERROR_DEPTH = + ConfigBuilder("spark.executor.killOnFatalError.depth") + .doc("The max depth of the exception chain in a failed task Spark will search for a fatal " + + "error to check whether it should kill an executor. 
0 means not checking any fatal " + + "error, 1 means checking only the exception but not the cause, and so on.") + .internal() + .version("3.1.0") + .intConf + .checkValue(_ >= 0, "needs to be a non-negative value") + .createWithDefault(5) + private[spark] val PUSH_BASED_SHUFFLE_ENABLED = ConfigBuilder("spark.shuffle.push.enabled") .doc("Set to 'true' to enable push-based shuffle on the client side and this works in " + diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 31049d104e63d..1326ae3c11a06 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -28,6 +28,7 @@ import scala.collection.immutable import scala.collection.mutable.{ArrayBuffer, Map} import scala.concurrent.duration._ +import com.google.common.cache.{CacheBuilder, CacheLoader} import org.mockito.ArgumentCaptor import org.mockito.ArgumentMatchers.{any, eq => meq} import org.mockito.Mockito.{inOrder, verify, when} @@ -43,7 +44,7 @@ import org.apache.spark.TaskState.TaskState import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ -import org.apache.spark.memory.TestMemoryManager +import org.apache.spark.memory.{SparkOutOfMemoryError, TestMemoryManager} import org.apache.spark.metrics.MetricsSystem import org.apache.spark.rdd.RDD import org.apache.spark.resource.ResourceInformation @@ -52,7 +53,7 @@ import org.apache.spark.scheduler.{DirectTaskResult, FakeTask, ResultTask, Task, import org.apache.spark.serializer.{JavaSerializer, SerializerInstance, SerializerManager} import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockManager, BlockManagerId} -import org.apache.spark.util.{LongAccumulator, UninterruptibleThread} +import org.apache.spark.util.{LongAccumulator, ThreadUtils, UninterruptibleThread} class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar with Eventually with PrivateMethodTester { @@ -402,6 +403,74 @@ class ExecutorSuite extends SparkFunSuite assert(taskMetrics.getMetricValue("JVMHeapMemory") > 0) } + test("SPARK-33587: isFatalError") { + def errorInThreadPool(e: => Throwable): Throwable = { + intercept[Throwable] { + val taskPool = ThreadUtils.newDaemonFixedThreadPool(1, "test") + try { + val f = taskPool.submit(new java.util.concurrent.Callable[String] { + override def call(): String = throw e + }) + f.get() + } finally { + taskPool.shutdown() + } + } + } + + def errorInGuavaCache(e: => Throwable): Throwable = { + val cache = CacheBuilder.newBuilder() + .build(new CacheLoader[String, String] { + override def load(key: String): String = throw e + }) + intercept[Throwable] { + cache.get("test") + } + } + + def testThrowable( + e: => Throwable, + depthToCheck: Int, + isFatal: Boolean): Unit = { + import Executor.isFatalError + // `e`'s depth is 1 so `depthToCheck` needs to be at least 3 to detect fatal errors. + assert(isFatalError(e, depthToCheck) == (depthToCheck >= 1 && isFatal)) + // `e`'s depth is 2 so `depthToCheck` needs to be at least 3 to detect fatal errors. 
+ assert(isFatalError(errorInThreadPool(e), depthToCheck) == (depthToCheck >= 2 && isFatal)) + assert(isFatalError(errorInGuavaCache(e), depthToCheck) == (depthToCheck >= 2 && isFatal)) + assert(isFatalError( + new SparkException("foo", e), + depthToCheck) == (depthToCheck >= 2 && isFatal)) + // `e`'s depth is 3 so `depthToCheck` needs to be at least 3 to detect fatal errors. + assert(isFatalError( + errorInThreadPool(errorInGuavaCache(e)), + depthToCheck) == (depthToCheck >= 3 && isFatal)) + assert(isFatalError( + errorInGuavaCache(errorInThreadPool(e)), + depthToCheck) == (depthToCheck >= 3 && isFatal)) + assert(isFatalError( + new SparkException("foo", new SparkException("foo", e)), + depthToCheck) == (depthToCheck >= 3 && isFatal)) + } + + for (depthToCheck <- 0 to 5) { + testThrowable(new OutOfMemoryError(), depthToCheck, isFatal = true) + testThrowable(new InterruptedException(), depthToCheck, isFatal = false) + testThrowable(new RuntimeException("test"), depthToCheck, isFatal = false) + testThrowable(new SparkOutOfMemoryError("test"), depthToCheck, isFatal = false) + } + + // Verify we can handle the cycle in the exception chain + val e1 = new Exception("test1") + val e2 = new Exception("test2") + e1.initCause(e2) + e2.initCause(e1) + for (depthToCheck <- 0 to 5) { + testThrowable(e1, depthToCheck, isFatal = false) + testThrowable(e2, depthToCheck, isFatal = false) + } + } + private def createMockEnv(conf: SparkConf, serializer: JavaSerializer): SparkEnv = { val mockEnv = mock[SparkEnv] val mockRpcEnv = mock[RpcEnv] From 0054fc937f804660c6501d9d3f6319f3047a68f8 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 29 Nov 2020 12:10:16 -0800 Subject: [PATCH 036/150] [SPARK-33588][SQL] Respect the `spark.sql.caseSensitive` config while resolving partition spec in v1 `SHOW TABLE EXTENDED` ### What changes were proposed in this pull request? Perform partition spec normalization in `ShowTablesCommand` according to the table schema before getting partitions from the catalog. The normalization via `PartitioningUtils.normalizePartitionSpec()` adjusts the column names in partition specification, w.r.t. the real partition column names and case sensitivity. ### Why are the changes needed? Even when `spark.sql.caseSensitive` is `false` which is the default value, v1 `SHOW TABLE EXTENDED` is case sensitive: ```sql spark-sql> CREATE TABLE tbl1 (price int, qty int, year int, month int) > USING parquet > partitioned by (year, month); spark-sql> INSERT INTO tbl1 PARTITION(year = 2015, month = 1) SELECT 1, 1; spark-sql> SHOW TABLE EXTENDED LIKE 'tbl1' PARTITION(YEAR = 2015, Month = 1); Error in query: Partition spec is invalid. The spec (YEAR, Month) must match the partition spec (year, month) defined in table '`default`.`tbl1`'; ``` ### Does this PR introduce _any_ user-facing change? Yes. After the changes, the `SHOW TABLE EXTENDED` command respects the SQL config. 
And for example above, it returns correct result: ```sql spark-sql> SHOW TABLE EXTENDED LIKE 'tbl1' PARTITION(YEAR = 2015, Month = 1); default tbl1 false Partition Values: [year=2015, month=1] Location: file:/Users/maximgekk/spark-warehouse/tbl1/year=2015/month=1 Serde Library: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe InputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat Storage Properties: [serialization.format=1, path=file:/Users/maximgekk/spark-warehouse/tbl1] Partition Parameters: {transient_lastDdlTime=1606595118, totalSize=623, numFiles=1} Created Time: Sat Nov 28 23:25:18 MSK 2020 Last Access: UNKNOWN Partition Statistics: 623 bytes ``` ### How was this patch tested? By running the modified test suite `v1/ShowTablesSuite` Closes #30529 from MaxGekk/show-table-case-sensitive-spec. Authored-by: Max Gekk Signed-off-by: Dongjoon Hyun --- .../spark/sql/execution/command/tables.scala | 17 ++++++++----- .../sql-tests/results/show-tables.sql.out | 2 +- .../command/v1/ShowTablesSuite.scala | 25 +++++++++++++++++++ 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index bd238948aab02..9e3ca3c321a54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -878,12 +878,17 @@ case class ShowTablesCommand( // // Note: tableIdentifierPattern should be non-empty, otherwise a [[ParseException]] // should have been thrown by the sql parser. - val tableIdent = TableIdentifier(tableIdentifierPattern.get, Some(db)) - val table = catalog.getTableMetadata(tableIdent).identifier - val partition = catalog.getPartition(tableIdent, partitionSpec.get) - val database = table.database.getOrElse("") - val tableName = table.table - val isTemp = catalog.isTemporaryTable(table) + val table = catalog.getTableMetadata(TableIdentifier(tableIdentifierPattern.get, Some(db))) + val tableIdent = table.identifier + val normalizedSpec = PartitioningUtils.normalizePartitionSpec( + partitionSpec.get, + table.partitionColumnNames, + tableIdent.quotedString, + sparkSession.sessionState.conf.resolver) + val partition = catalog.getPartition(tableIdent, normalizedSpec) + val database = tableIdent.database.getOrElse("") + val tableName = tableIdent.table + val isTemp = catalog.isTemporaryTable(tableIdent) val information = partition.simpleString Seq(Row(database, tableName, isTemp, s"$information\n")) } diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index a95b02c7f7743..60c5e6d5642b7 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -224,7 +224,7 @@ SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(a='Us', d=1) struct<> -- !query output org.apache.spark.sql.AnalysisException -Partition spec is invalid. 
The spec (a, d) must match the partition spec (c, d) defined in table '`showdb`.`show_t1`'; +a is not a valid partition column in table `showdb`.`show_t1`.; -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala index 5bbc6c6285193..8f29f9f276138 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.command.v1 import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution.command +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{BooleanType, StringType, StructType} @@ -84,6 +85,30 @@ trait ShowTablesSuiteBase extends command.ShowTablesSuiteBase { result.foreach { case Row(_, _, _, info: String) => assert(info.nonEmpty) } } } + + test("case sensitivity of partition spec") { + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val t = s"$catalog.ns.part_table" + withTable(t) { + sql(s""" + |CREATE TABLE $t (price int, qty int, year int, month int) + |$defaultUsing + |partitioned by (year, month)""".stripMargin) + sql(s"INSERT INTO $t PARTITION(year = 2015, month = 1) SELECT 1, 1") + Seq( + true -> "PARTITION(year = 2015, month = 1)", + false -> "PARTITION(YEAR = 2015, Month = 1)" + ).foreach { case (caseSensitive, partitionSpec) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val df = sql(s"SHOW TABLE EXTENDED LIKE 'part_table' $partitionSpec") + val information = df.select("information").first().getString(0) + assert(information.contains("Partition Values: [year=2015, month=1]")) + } + } + } + } + } } class ShowTablesSuite extends ShowTablesSuiteBase with SharedSparkSession From a088a801ed8c17171545c196a3f26ce415de0cd1 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 29 Nov 2020 12:18:07 -0800 Subject: [PATCH 037/150] [SPARK-33585][SQL][DOCS] Fix the comment for `SQLContext.tables()` and mention the `database` column ### What changes were proposed in this pull request? Change the comments for `SQLContext.tables()` to "The returned DataFrame has three columns, database, tableName and isTemporary". ### Why are the changes needed? Currently, the comment mentions only 2 columns but `tables()` returns 3 columns actually: ```scala scala> spark.range(10).createOrReplaceTempView("view1") scala> val tables = spark.sqlContext.tables() tables: org.apache.spark.sql.DataFrame = [database: string, tableName: string ... 1 more field] scala> tables.printSchema root |-- database: string (nullable = false) |-- tableName: string (nullable = false) |-- isTemporary: boolean (nullable = false) scala> tables.show +--------+---------+-----------+ |database|tableName|isTemporary| +--------+---------+-----------+ | default| t1| false| | default| t2| false| | default| ymd| false| | | view1| true| +--------+---------+-----------+ ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running `./dev/scalastyle` Closes #30526 from MaxGekk/sqlcontext-tables-doc. 
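Since all three columns are part of the returned schema, callers can rely on them directly; a small usage illustration (assumes an active `spark` session, not part of the patch itself):

```scala
// List only temporary views, using all three documented columns.
spark.sqlContext.tables()
  .where("isTemporary")
  .select("database", "tableName")
  .show()
```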
Authored-by: Max Gekk Signed-off-by: Dongjoon Hyun --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 7cf0b6bb70364..dd237962110ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -661,7 +661,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Returns a `DataFrame` containing names of existing tables in the current database. - * The returned DataFrame has two columns, tableName and isTemporary (a Boolean + * The returned DataFrame has three columns, database, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). * * @group ddl_ops @@ -673,7 +673,7 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * Returns a `DataFrame` containing names of existing tables in the given database. - * The returned DataFrame has two columns, tableName and isTemporary (a Boolean + * The returned DataFrame has three columns, database, tableName and isTemporary (a Boolean * indicating if a table is a temporary one or not). * * @group ddl_ops From 3d54774fb9cbf674580851aa2323991c7e462a1e Mon Sep 17 00:00:00 2001 From: liucht Date: Mon, 30 Nov 2020 10:03:18 +0900 Subject: [PATCH 038/150] [SPARK-33517][SQL][DOCS] Fix the correct menu items and page links in PySpark Usage Guide for Pandas with Apache Arrow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? Change "Apache Arrow in Spark" to "Apache Arrow in PySpark" and the link to “/sql-pyspark-pandas-with-arrow.html#apache-arrow-in-pyspark” ### Why are the changes needed? When I click on the menu item it doesn't point to the correct page, and from the parent menu I can infer that the correct menu item name and link should be "Apache Arrow in PySpark". like this: image ![image](https://user-images.githubusercontent.com/28332082/99954725-2b64e200-2dbe-11eb-9576-cf6a3d758980.png) ### Does this PR introduce any user-facing change? Yes, clicking on the menu item will take you to the correct guide page ### How was this patch tested? Manually build the doc. This can be verified as below: cd docs SKIP_API=1 jekyll build open _site/sql-pyspark-pandas-with-arrow.html Closes #30466 from liucht-inspur/master. Authored-by: liucht Signed-off-by: HyukjinKwon --- docs/_data/menu-sql.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index ec0b404fe672f..cda2a1a5139a1 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -64,17 +64,6 @@ url: sql-distributed-sql-engine.html#running-the-spark-sql-cli - text: PySpark Usage Guide for Pandas with Apache Arrow url: sql-pyspark-pandas-with-arrow.html - subitems: - - text: Apache Arrow in Spark - url: sql-pyspark-pandas-with-arrow.html#apache-arrow-in-spark - - text: "Enabling for Conversion to/from Pandas" - url: sql-pyspark-pandas-with-arrow.html#enabling-for-conversion-tofrom-pandas - - text: "Pandas UDFs (a.k.a. 
Vectorized UDFs)" - url: sql-pyspark-pandas-with-arrow.html#pandas-udfs-aka-vectorized-udfs - - text: "Pandas Function APIs" - url: sql-pyspark-pandas-with-arrow.html#pandas-function-apis - - text: Usage Notes - url: sql-pyspark-pandas-with-arrow.html#usage-notes - text: Migration Guide url: sql-migration-old.html - text: SQL Reference From f93d4395b25ea546cebb1ff16879dea696a217b5 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 30 Nov 2020 11:21:02 +0900 Subject: [PATCH 039/150] [SPARK-33589][SQL] Close opened session if the initialization fails ### What changes were proposed in this pull request? This pr add try catch when opening session. ### Why are the changes needed? Close opened session if the initialization fails. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual test. Before this pr: ``` [rootspark-3267648 spark]# bin/beeline -u jdbc:hive2://localhost:10000/db_not_exist NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark classes ahead of assembly. Connecting to jdbc:hive2://localhost:10000/db_not_exist log4j:WARN No appenders could be found for logger (org.apache.hive.jdbc.Utils). log4j:WARN Please initialize the log4j system properly. log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info. Error: Could not open client transport with JDBC Uri: jdbc:hive2://localhost:10000/db_not_exist: Database 'db_not_exist' not found; (state=08S01,code=0) Beeline version 2.3.7 by Apache Hive beeline> ``` ![image](https://user-images.githubusercontent.com/5399861/100560975-73ba5d80-32f2-11eb-8f92-b2509e7a121f.png) After this pr: ``` [rootspark-3267648 spark]# bin/beeline -u jdbc:hive2://localhost:10000/db_not_exist NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark classes ahead of assembly. log4j:WARN No appenders could be found for logger (org.apache.hadoop.util.Shell). log4j:WARN Please initialize the log4j system properly. log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info. Connecting to jdbc:hive2://localhost:10000/db_not_exist Error: Could not open client transport with JDBC Uri: jdbc:hive2://localhost:10000/db_not_exist: Failed to open new session: org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: Database 'db_not_exist' not found; (state=08S01,code=0) Beeline version 2.3.7 by Apache Hive beeline> ``` ![image](https://user-images.githubusercontent.com/5399861/100560917-479edc80-32f2-11eb-986f-7a997f1163fc.png) Closes #30536 from wangyum/SPARK-33589. 
Authored-by: Yuming Wang Signed-off-by: HyukjinKwon --- .../thriftserver/SparkSQLSessionManager.scala | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index 856edede0b85f..0c092abb37f3e 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hive.service.cli.SessionHandle +import org.apache.hive.service.cli.{HiveSQLException, SessionHandle} import org.apache.hive.service.cli.session.SessionManager import org.apache.hive.service.rpc.thrift.TProtocolVersion import org.apache.hive.service.server.HiveServer2 +import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ @@ -32,7 +33,7 @@ import org.apache.spark.sql.internal.SQLConf private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: SQLContext) extends SessionManager(hiveServer) - with ReflectedCompositeService { + with ReflectedCompositeService with Logging { private lazy val sparkSqlOperationManager = new SparkSQLOperationManager() @@ -52,24 +53,35 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: val sessionHandle = super.openSession(protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken) - val session = super.getSession(sessionHandle) - HiveThriftServer2.eventManager.onSessionCreated( - session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) - val ctx = if (sqlContext.conf.hiveThriftServerSingleSession) { - sqlContext - } else { - sqlContext.newSession() + try { + val session = super.getSession(sessionHandle) + HiveThriftServer2.eventManager.onSessionCreated( + session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) + val ctx = if (sqlContext.conf.hiveThriftServerSingleSession) { + sqlContext + } else { + sqlContext.newSession() + } + ctx.setConf(HiveUtils.FAKE_HIVE_VERSION.key, HiveUtils.builtinHiveVersion) + ctx.setConf(SQLConf.DATETIME_JAVA8API_ENABLED, true) + val hiveSessionState = session.getSessionState + setConfMap(ctx, hiveSessionState.getOverriddenConfigurations) + setConfMap(ctx, hiveSessionState.getHiveVariables) + if (sessionConf != null && sessionConf.containsKey("use:database")) { + ctx.sql(s"use ${sessionConf.get("use:database")}") + } + sparkSqlOperationManager.sessionToContexts.put(sessionHandle, ctx) + sessionHandle + } catch { + case e: Exception => + try { + closeSession(sessionHandle) + } catch { + case t: Throwable => + logWarning("Error closing session", t) + } + throw new HiveSQLException("Failed to open new session: " + e, e) } - ctx.setConf(HiveUtils.FAKE_HIVE_VERSION.key, HiveUtils.builtinHiveVersion) - ctx.setConf(SQLConf.DATETIME_JAVA8API_ENABLED, true) - val hiveSessionState = session.getSessionState - setConfMap(ctx, hiveSessionState.getOverriddenConfigurations) - setConfMap(ctx, hiveSessionState.getHiveVariables) - if (sessionConf != null && sessionConf.containsKey("use:database")) { - ctx.sql(s"use 
${sessionConf.get("use:database")}") - } - sparkSqlOperationManager.sessionToContexts.put(sessionHandle, ctx) - sessionHandle } override def closeSession(sessionHandle: SessionHandle): Unit = { From a5e13acd19871831a93a5bdcbc99a9eb9f1aba07 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 30 Nov 2020 11:24:15 +0900 Subject: [PATCH 040/150] [SPARK-33582][SQL] Hive Metastore support filter by not-equals ### What changes were proposed in this pull request? This pr make partition predicate pushdown into Hive metastore support not-equals operator. Hive related changes: https://github.com/apache/hive/blob/b8bd4594bef718b1eeac9fceb437d7df7b480ed1/itests/hive-unit/src/test/java/org/apache/hadoop/hive/metastore/TestHiveMetaStore.java#L2194-L2207 https://issues.apache.org/jira/browse/HIVE-2702 ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #30534 from wangyum/SPARK-33582. Authored-by: Yuming Wang Signed-off-by: HyukjinKwon --- .../spark/sql/hive/client/HiveShim.scala | 8 ++++++++ .../spark/sql/hive/client/FiltersSuite.scala | 8 ++++++++ .../client/HivePartitionFilteringSuite.scala | 20 +++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 17a64a67df283..ed088648bc20a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -812,6 +812,14 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { right <- convert(expr2) } yield s"($left or $right)" + case Not(EqualTo( + ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value))) if useAdvanced => + Some(s"$name != $value") + + case Not(EqualTo( + ExtractableLiteral(value), ExtractAttribute(SupportedAttribute(name)))) if useAdvanced => + Some(s"$value != $name") + case _ => None } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 6c0531182e6d6..12ed0e5305299 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -100,6 +100,14 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil, "(intcol = 1)") + filterTest("NOT: int and string filters", + (a("intcol", IntegerType) =!= Literal(1)) :: (Literal("a") =!= a("strcol", IntegerType)) :: Nil, + """intcol != 1 and "a" != strcol""") + + filterTest("NOT: date filter", + (a("datecol", DateType) =!= Literal(Date.valueOf("2019-01-01"))) :: Nil, + "datecol != 2019-01-01") + // Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization // will be applied by Catalyst, this filter converter does not need to account for this. 
filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE", diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index e07fbc29ee8aa..dc56e6bc4da81 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -352,6 +352,26 @@ class HivePartitionFilteringSuite(version: String) dateStrValue) } + test("getPartitionsByFilter: ds<>20170101") { + testMetastorePartitionFiltering( + attr("ds") =!= 20170101, + 20170102 to 20170103, + hValue, + chunkValue, + dateValue, + dateStrValue) + } + + test("getPartitionsByFilter: h<>0 and chunk<>ab and d<>2019-01-01") { + testMetastorePartitionFiltering( + attr("h") =!= 0 && attr("chunk") =!= "ab" && attr("d") =!= Date.valueOf("2019-01-01"), + dsValue, + 1 to 4, + Seq("aa", "ba", "bb"), + Seq("2019-01-02", "2019-01-03"), + dateStrValue) + } + test("getPartitionsByFilter: d=2019-01-01") { testMetastorePartitionFiltering( attr("d") === Date.valueOf("2019-01-01"), From feda7299e3d8ebe665b8fae0328f22a4927c66da Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Mon, 30 Nov 2020 04:50:50 +0000 Subject: [PATCH 041/150] [SPARK-33567][SQL] DSv2: Use callback instead of passing Spark session and v2 relation for refreshing cache ### What changes were proposed in this pull request? This replaces Spark session and `DataSourceV2Relation` in V2 write plans by replacing them with a callback `afterWrite`. ### Why are the changes needed? Per discussion in #30429, it's better to not pass Spark session and `DataSourceV2Relation` through Spark plans. Instead we can use a callback which makes the interface cleaner. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? N/A Closes #30491 from sunchao/SPARK-33492-followup. 
Authored-by: Chao Sun Signed-off-by: Wenchen Fan --- .../datasources/v2/DataSourceV2Strategy.scala | 26 +++++++++++++------ .../datasources/v2/DropTableExec.scala | 11 +++----- .../datasources/v2/RefreshTableExec.scala | 11 +++----- .../datasources/v2/V1FallbackWriters.scala | 15 ++++++----- .../v2/WriteToDataSourceV2Exec.scala | 21 +++++++-------- 5 files changed, 43 insertions(+), 41 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index eb0d7010041b9..1fae8d937e90c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -52,6 +52,15 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat } } + private def refreshCache(r: DataSourceV2Relation)(): Unit = { + session.sharedState.cacheManager.recacheByPlan(session, r) + } + + private def invalidateCache(r: ResolvedTable)(): Unit = { + val v2Relation = DataSourceV2Relation.create(r.table, Some(r.catalog), Some(r.identifier)) + session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade = true) + } + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case PhysicalOperation(project, filters, relation @ DataSourceV2ScanRelation(_, V1ScanWrapper(scan, translated, pushed), output)) => @@ -128,7 +137,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat } case RefreshTable(r: ResolvedTable) => - RefreshTableExec(session, r.catalog, r.table, r.identifier) :: Nil + RefreshTableExec(r.catalog, r.identifier, invalidateCache(r)) :: Nil case ReplaceTable(catalog, ident, schema, parts, props, orCreate) => val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) @@ -172,9 +181,9 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case AppendData(r: DataSourceV2Relation, query, writeOptions, _) => r.table.asWritable match { case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => - AppendDataExecV1(v1, writeOptions.asOptions, query, r) :: Nil + AppendDataExecV1(v1, writeOptions.asOptions, query, refreshCache(r)) :: Nil case v2 => - AppendDataExec(session, v2, r, writeOptions.asOptions, planLater(query)) :: Nil + AppendDataExec(v2, writeOptions.asOptions, planLater(query), refreshCache(r)) :: Nil } case OverwriteByExpression(r: DataSourceV2Relation, deleteExpr, query, writeOptions, _) => @@ -186,15 +195,16 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat }.toArray r.table.asWritable match { case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => - OverwriteByExpressionExecV1(v1, filters, writeOptions.asOptions, query, r) :: Nil + OverwriteByExpressionExecV1(v1, filters, writeOptions.asOptions, + query, refreshCache(r)) :: Nil case v2 => - OverwriteByExpressionExec(session, v2, r, filters, - writeOptions.asOptions, planLater(query)) :: Nil + OverwriteByExpressionExec(v2, filters, + writeOptions.asOptions, planLater(query), refreshCache(r)) :: Nil } case OverwritePartitionsDynamic(r: DataSourceV2Relation, query, writeOptions, _) => OverwritePartitionsDynamicExec( - session, r.table.asWritable, r, writeOptions.asOptions, planLater(query)) :: Nil + r.table.asWritable, writeOptions.asOptions, planLater(query), refreshCache(r)) :: Nil case DeleteFromTable(relation, 
condition) => relation match { @@ -232,7 +242,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat throw new AnalysisException("Describing columns is not supported for v2 tables.") case DropTable(r: ResolvedTable, ifExists, purge) => - DropTableExec(session, r.catalog, r.table, r.identifier, ifExists, purge) :: Nil + DropTableExec(r.catalog, r.identifier, ifExists, purge, invalidateCache(r)) :: Nil case _: NoopDropTable => LocalTableScanExec(Nil, Nil) :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala index 068475fc56f47..f89b89096772a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala @@ -17,27 +17,24 @@ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog} +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} /** * Physical plan node for dropping a table. */ case class DropTableExec( - session: SparkSession, catalog: TableCatalog, - table: Table, ident: Identifier, ifExists: Boolean, - purge: Boolean) extends V2CommandExec { + purge: Boolean, + invalidateCache: () => Unit) extends V2CommandExec { override def run(): Seq[InternalRow] = { if (catalog.tableExists(ident)) { - val v2Relation = DataSourceV2Relation.create(table, Some(catalog), Some(ident)) - session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade = true) + invalidateCache() catalog.dropTable(ident, purge) } else if (!ifExists) { throw new NoSuchTableException(ident) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala index 52836de5a926b..994583c1e338f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala @@ -17,23 +17,20 @@ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog} +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} case class RefreshTableExec( - session: SparkSession, catalog: TableCatalog, - table: Table, - ident: Identifier) extends V2CommandExec { + ident: Identifier, + invalidateCache: () => Unit) extends V2CommandExec { override protected def run(): Seq[InternalRow] = { catalog.invalidateTable(ident) // invalidate all caches referencing the given table // TODO(SPARK-33437): re-cache the table itself once we support caching a DSv2 table - val v2Relation = DataSourceV2Relation.create(table, Some(catalog), Some(ident)) - session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade = true) + invalidateCache() Seq.empty } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala index af7721588edeb..9d2cea9fbaff3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala @@ -38,10 +38,10 @@ case class AppendDataExecV1( table: SupportsWrite, writeOptions: CaseInsensitiveStringMap, plan: LogicalPlan, - v2Relation: DataSourceV2Relation) extends V1FallbackWriters { + refreshCache: () => Unit) extends V1FallbackWriters { override protected def run(): Seq[InternalRow] = { - writeWithV1(newWriteBuilder().buildForV1Write(), Some(v2Relation)) + writeWithV1(newWriteBuilder().buildForV1Write(), refreshCache = refreshCache) } } @@ -61,7 +61,7 @@ case class OverwriteByExpressionExecV1( deleteWhere: Array[Filter], writeOptions: CaseInsensitiveStringMap, plan: LogicalPlan, - v2Relation: DataSourceV2Relation) extends V1FallbackWriters { + refreshCache: () => Unit) extends V1FallbackWriters { private def isTruncate(filters: Array[Filter]): Boolean = { filters.length == 1 && filters(0).isInstanceOf[AlwaysTrue] @@ -70,10 +70,11 @@ case class OverwriteByExpressionExecV1( override protected def run(): Seq[InternalRow] = { newWriteBuilder() match { case builder: SupportsTruncate if isTruncate(deleteWhere) => - writeWithV1(builder.truncate().asV1Builder.buildForV1Write(), Some(v2Relation)) + writeWithV1(builder.truncate().asV1Builder.buildForV1Write(), refreshCache = refreshCache) case builder: SupportsOverwrite => - writeWithV1(builder.overwrite(deleteWhere).asV1Builder.buildForV1Write(), Some(v2Relation)) + writeWithV1(builder.overwrite(deleteWhere).asV1Builder.buildForV1Write(), + refreshCache = refreshCache) case _ => throw new SparkException(s"Table does not support overwrite by expression: $table") @@ -116,11 +117,11 @@ trait SupportsV1Write extends SparkPlan { protected def writeWithV1( relation: InsertableRelation, - v2Relation: Option[DataSourceV2Relation] = None): Seq[InternalRow] = { + refreshCache: () => Unit = () => ()): Seq[InternalRow] = { val session = sqlContext.sparkSession // The `plan` is already optimized, we should not analyze and optimize it again. relation.insert(AlreadyOptimized.dataFrame(session, plan), overwrite = false) - v2Relation.foreach(r => session.sharedState.cacheManager.recacheByPlan(session, r)) + refreshCache() Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index 1648134d0a1b2..47aad2bcb2c56 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -213,15 +213,14 @@ case class AtomicReplaceTableAsSelectExec( * Rows in the output data set are appended. 
*/ case class AppendDataExec( - session: SparkSession, table: SupportsWrite, - relation: DataSourceV2Relation, writeOptions: CaseInsensitiveStringMap, - query: SparkPlan) extends V2TableWriteExec with BatchWriteHelper { + query: SparkPlan, + refreshCache: () => Unit) extends V2TableWriteExec with BatchWriteHelper { override protected def run(): Seq[InternalRow] = { val writtenRows = writeWithV2(newWriteBuilder().buildForBatch()) - session.sharedState.cacheManager.recacheByPlan(session, relation) + refreshCache() writtenRows } } @@ -237,12 +236,11 @@ case class AppendDataExec( * AlwaysTrue to delete all rows. */ case class OverwriteByExpressionExec( - session: SparkSession, table: SupportsWrite, - relation: DataSourceV2Relation, deleteWhere: Array[Filter], writeOptions: CaseInsensitiveStringMap, - query: SparkPlan) extends V2TableWriteExec with BatchWriteHelper { + query: SparkPlan, + refreshCache: () => Unit) extends V2TableWriteExec with BatchWriteHelper { private def isTruncate(filters: Array[Filter]): Boolean = { filters.length == 1 && filters(0).isInstanceOf[AlwaysTrue] @@ -259,7 +257,7 @@ case class OverwriteByExpressionExec( case _ => throw new SparkException(s"Table does not support overwrite by expression: $table") } - session.sharedState.cacheManager.recacheByPlan(session, relation) + refreshCache() writtenRows } } @@ -275,11 +273,10 @@ case class OverwriteByExpressionExec( * are not modified. */ case class OverwritePartitionsDynamicExec( - session: SparkSession, table: SupportsWrite, - relation: DataSourceV2Relation, writeOptions: CaseInsensitiveStringMap, - query: SparkPlan) extends V2TableWriteExec with BatchWriteHelper { + query: SparkPlan, + refreshCache: () => Unit) extends V2TableWriteExec with BatchWriteHelper { override protected def run(): Seq[InternalRow] = { val writtenRows = newWriteBuilder() match { @@ -289,7 +286,7 @@ case class OverwritePartitionsDynamicExec( case _ => throw new SparkException(s"Table does not support dynamic partition overwrite: $table") } - session.sharedState.cacheManager.recacheByPlan(session, relation) + refreshCache() writtenRows } } From 485145326a9c97ede260b0e267ee116f182cfd56 Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Mon, 30 Nov 2020 13:59:51 +0900 Subject: [PATCH 042/150] [MINOR] Spelling bin core docs external mllib repl ### What changes were proposed in this pull request? This PR intends to fix typos in the sub-modules: * `bin` * `core` * `docs` * `external` * `mllib` * `repl` * `pom.xml` Split per srowen https://github.com/apache/spark/pull/30323#issuecomment-728981618 NOTE: The misspellings have been reported at https://github.com/jsoref/spark/commit/706a726f87a0bbf5e31467fae9015218773db85b#commitcomment-44064356 ### Why are the changes needed? Misspelled words make it harder to read / understand content. ### Does this PR introduce _any_ user-facing change? There are various fixes to documentation, etc... ### How was this patch tested? No testing was performed Closes #30530 from jsoref/spelling-bin-core-docs-external-mllib-repl. 
Authored-by: Josh Soref Signed-off-by: Takeshi Yamamuro --- bin/docker-image-tool.sh | 2 +- .../apache/spark/ui/static/spark-dag-viz.js | 2 +- .../org/apache/spark/ui/static/utils.js | 2 +- .../spark/ExecutorAllocationManager.scala | 4 +- .../apache/spark/api/java/JavaPairRDD.scala | 4 +- .../apache/spark/api/java/JavaRDDLike.scala | 2 +- .../apache/spark/api/python/PythonRDD.scala | 6 +- .../apache/spark/deploy/JsonProtocol.scala | 2 +- .../org/apache/spark/deploy/SparkSubmit.scala | 2 +- .../deploy/history/FsHistoryProvider.scala | 2 +- .../spark/deploy/history/HybridStore.scala | 2 +- .../org/apache/spark/executor/Executor.scala | 4 +- .../apache/spark/metrics/MetricsConfig.scala | 2 +- .../metrics/sink/PrometheusServlet.scala | 6 +- .../apache/spark/rdd/DoubleRDDFunctions.scala | 2 +- .../spark/rdd/OrderedRDDFunctions.scala | 4 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 2 +- .../spark/resource/TaskResourceRequest.scala | 2 +- .../apache/spark/rpc/netty/NettyRpcEnv.scala | 4 +- .../BarrierJobAllocationFailed.scala | 4 +- .../apache/spark/scheduler/DAGScheduler.scala | 8 +- .../spark/scheduler/HealthTracker.scala | 4 +- .../spark/scheduler/TaskSetManager.scala | 2 +- .../spark/security/CryptoStreamUtils.scala | 2 +- .../apache/spark/storage/BlockManager.scala | 4 +- .../storage/BlockManagerMasterEndpoint.scala | 2 +- .../apache/spark/ui/jobs/AllJobsPage.scala | 2 +- .../org/apache/spark/ui/jobs/JobPage.scala | 2 +- .../apache/spark/util/ClosureCleaner.scala | 2 +- .../scala/org/apache/spark/util/Utils.scala | 22 ++-- .../spark/util/io/ChunkedByteBuffer.scala | 2 +- .../sort/UnsafeShuffleWriterSuite.java | 10 +- .../test/org/apache/spark/JavaAPISuite.java | 2 +- .../org/apache/spark/CheckpointSuite.scala | 12 +- .../apache/spark/ContextCleanerSuite.scala | 10 +- .../ExecutorAllocationManagerSuite.scala | 2 +- .../scala/org/apache/spark/FileSuite.scala | 2 +- .../spark/benchmark/BenchmarkBase.scala | 2 +- .../history/FsHistoryProviderSuite.scala | 4 +- .../spark/deploy/master/MasterSuite.scala | 2 +- .../spark/deploy/worker/WorkerSuite.scala | 2 +- .../apache/spark/executor/ExecutorSuite.scala | 2 +- ...FileCommitProtocolInstantiationSuite.scala | 4 +- .../metrics/InputOutputMetricsSuite.scala | 2 +- .../NettyBlockTransferServiceSuite.scala | 2 +- .../spark/rdd/PairRDDFunctionsSuite.scala | 34 +++--- .../scala/org/apache/spark/rdd/RDDSuite.scala | 2 +- .../spark/resource/ResourceUtilsSuite.scala | 2 +- .../spark/rpc/netty/NettyRpcEnvSuite.scala | 2 +- .../spark/scheduler/DAGSchedulerSuite.scala | 6 +- .../spark/scheduler/ReplayListenerSuite.scala | 2 +- .../scheduler/SchedulerIntegrationSuite.scala | 8 +- .../spark/scheduler/SparkListenerSuite.scala | 6 +- .../spark/scheduler/TaskSetManagerSuite.scala | 6 +- .../spark/status/AppStatusListenerSuite.scala | 2 +- .../spark/storage/BlockManagerSuite.scala | 4 +- .../apache/spark/util/JsonProtocolSuite.scala | 8 +- .../spark/util/SizeEstimatorSuite.scala | 2 +- docs/_plugins/include_example.rb | 4 +- docs/building-spark.md | 2 +- docs/configuration.md | 2 +- docs/css/main.css | 4 +- docs/graphx-programming-guide.md | 4 +- docs/ml-migration-guide.md | 2 +- docs/mllib-clustering.md | 2 +- docs/mllib-data-types.md | 2 +- docs/monitoring.md | 6 +- docs/running-on-kubernetes.md | 4 +- docs/running-on-mesos.md | 2 +- docs/running-on-yarn.md | 2 +- docs/sparkr.md | 2 +- docs/sql-data-sources-jdbc.md | 2 +- docs/sql-migration-guide.md | 6 +- ...l-ref-syntax-aux-conf-mgmt-set-timezone.md | 2 +- ...-ref-syntax-ddl-create-table-hiveformat.md | 8 
+- docs/sql-ref-syntax-dml-insert-into.md | 114 +++++++++--------- ...l-ref-syntax-dml-insert-overwrite-table.md | 52 ++++---- docs/sql-ref-syntax-qry-select-groupby.md | 4 +- .../sql-ref-syntax-qry-select-lateral-view.md | 6 +- docs/sql-ref-syntax-qry-select-orderby.md | 2 +- .../ml/evaluation/ClusteringMetrics.scala | 4 +- .../apache/spark/ml/feature/Binarizer.scala | 6 +- .../apache/spark/ml/feature/Selector.scala | 2 +- .../spark/ml/feature/StopWordsRemover.scala | 6 +- .../apache/spark/ml/image/ImageSchema.scala | 2 +- .../ml/r/AFTSurvivalRegressionWrapper.scala | 4 +- .../spark/ml/regression/FMRegressor.scala | 2 +- .../spark/mllib/classification/SVM.scala | 2 +- .../mllib/clustering/DistanceMeasure.scala | 6 +- .../spark/mllib/clustering/LDAOptimizer.scala | 2 +- .../mllib/clustering/StreamingKMeans.scala | 2 +- .../org/apache/spark/mllib/feature/PCA.scala | 4 +- .../apache/spark/mllib/feature/Word2Vec.scala | 2 +- .../spark/mllib/fpm/AssociationRules.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala | 4 +- .../stat/test/KolmogorovSmirnovTest.scala | 2 +- .../ml/feature/JavaStopWordsRemoverSuite.java | 2 +- .../ml/clustering/GaussianMixtureSuite.scala | 2 +- .../evaluation/RegressionEvaluatorSuite.scala | 2 +- .../spark/ml/feature/ANOVASelectorSuite.scala | 10 +- .../apache/spark/ml/feature/DCTSuite.scala | 2 +- .../org/apache/spark/ml/feature/LSHTest.scala | 2 +- .../VarianceThresholdSelectorSuite.scala | 2 +- .../GeneralizedLinearRegressionSuite.scala | 4 +- pom.xml | 4 +- .../spark/repl/ExecutorClassLoaderSuite.scala | 5 +- 106 files changed, 288 insertions(+), 289 deletions(-) diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index 6d74f8328aea2..2ec1ab8861798 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -274,7 +274,7 @@ Examples: - Build and push JDK11-based image for multiple archs to docker.io/myrepo $0 -r docker.io/myrepo -t v3.0.0 -X -b java_image_tag=11-jre-slim build # Note: buildx, which does cross building, needs to do the push during build - # So there is no seperate push step with -X + # So there is no separate push step with -X EOF } diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 474c453643365..1fc1fb4b4513b 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -334,7 +334,7 @@ function preprocessGraphLayout(g, forJob) { } /* - * Helper function to size the SVG appropriately such that all elements are displyed. + * Helper function to size the SVG appropriately such that all elements are displayed. * This assumes that all outermost elements are clusters (rectangles). */ function resizeSvg(svg) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js index 4cd83332cde5f..7e6dd678e2641 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/utils.js +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -74,7 +74,7 @@ function getTimeZone() { return Intl.DateTimeFormat().resolvedOptions().timeZone; } catch(ex) { // Get time zone from a string representing the date, - // eg. "Thu Nov 16 2017 01:13:32 GMT+0800 (CST)" -> "CST" + // e.g. 
"Thu Nov 16 2017 01:13:32 GMT+0800 (CST)" -> "CST" return new Date().toString().match(/\((.*)\)/)[1]; } } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index e445f188e1eed..61ab63584269b 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -248,7 +248,7 @@ private[spark] class ExecutorAllocationManager( executor.scheduleWithFixedDelay(scheduleTask, 0, intervalMillis, TimeUnit.MILLISECONDS) } - // copy the maps inside synchonize to ensure not being modified + // copy the maps inside synchronize to ensure not being modified val (numExecutorsTarget, numLocalityAware) = synchronized { val numTarget = numExecutorsTargetPerResourceProfileId.toMap val numLocality = numLocalityAwareTasksPerResourceProfileId.toMap @@ -379,7 +379,7 @@ private[spark] class ExecutorAllocationManager( // We lower the target number of executors but don't actively kill any yet. Killing is // controlled separately by an idle timeout. It's still helpful to reduce - // the target number in case an executor just happens to get lost (eg., bad hardware, + // the target number in case an executor just happens to get lost (e.g., bad hardware, // or the cluster manager preempts it) -- in that case, there is no point in trying // to immediately get a new executor, since we wouldn't even use it yet. decrementExecutorsFromTarget(maxNeeded, rpId, updatesNeeded) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 1bcd203f2e435..6dd36309378cc 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -941,7 +941,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Return a RDD containing only the elements in the inclusive range `lower` to `upper`. * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be - * performed efficiently by only scanning the partitions that might containt matching elements. + * performed efficiently by only scanning the partitions that might contain matching elements. * Otherwise, a standard `filter` is applied to all partitions. * * @since 3.1.0 @@ -955,7 +955,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Return a RDD containing only the elements in the inclusive range `lower` to `upper`. * If the RDD has been partitioned using a `RangePartitioner`, then this operation can be - * performed efficiently by only scanning the partitions that might containt matching elements. + * performed efficiently by only scanning the partitions that might contain matching elements. * Otherwise, a standard `filter` is applied to all partitions. * * @since 3.1.0 diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 89b33945dfb08..306af24ada584 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -78,7 +78,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Internal method to this RDD; will read from cache if applicable, or otherwise compute it. 
- * This should ''not'' be called by users directly, but is available for implementors of custom + * This should ''not'' be called by users directly, but is available for implementers of custom * subclasses of RDD. */ def iterator(split: Partition, taskContext: TaskContext): JIterator[T] = diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 86a1ac31c0845..6d4dc3d3dfe92 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -48,14 +48,14 @@ import org.apache.spark.util._ private[spark] class PythonRDD( parent: RDD[_], func: PythonFunction, - preservePartitoning: Boolean, + preservePartitioning: Boolean, isFromBarrier: Boolean = false) extends RDD[Array[Byte]](parent) { override def getPartitions: Array[Partition] = firstParent.partitions override val partitioner: Option[Partitioner] = { - if (preservePartitoning) firstParent.partitioner else None + if (preservePartitioning) firstParent.partitioner else None } val asJavaRDD: JavaRDD[Array[Byte]] = JavaRDD.fromRDD(this) @@ -837,7 +837,7 @@ private[spark] class PythonBroadcast(@transient var path: String) extends Serial * We might be serializing a really large object from python -- we don't want * python to buffer the whole thing in memory, nor can it write to a file, * so we don't know the length in advance. So python writes it in chunks, each chunk - * preceeded by a length, till we get a "length" of -1 which serves as EOF. + * preceded by a length, till we get a "length" of -1 which serves as EOF. * * Tested from python tests. */ diff --git a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala index d76fb7f9a20b3..f697892aacc83 100644 --- a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala @@ -80,7 +80,7 @@ private[deploy] object JsonProtocol { } /** - * Export the [[ApplicationInfo]] to a Json objec. An [[ApplicationInfo]] consists of the + * Export the [[ApplicationInfo]] to a Json object. An [[ApplicationInfo]] consists of the * information of an application. * * @return a Json object containing the following fields: diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 7332c6d54c981..4aa393c514af6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -311,7 +311,7 @@ private[spark] class SparkSubmit extends Logging { // In K8s client mode, when in the driver, add resolved jars early as we might need // them at the submit time for artifact downloading. // For example we might use the dependencies for downloading - // files from a Hadoop Compatible fs eg. S3. In this case the user might pass: + // files from a Hadoop Compatible fs e.g. S3. 
In this case the user might pass: // --packages com.amazonaws:aws-java-sdk:1.7.4:org.apache.hadoop:hadoop-aws:2.7.6 if (isKubernetesClusterModeDriver) { val loader = getSubmitClassLoader(sparkConf) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e5341aff8ce66..e6df260bdeaa3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -722,7 +722,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) /** * Replay the given log file, saving the application in the listing db. - * Visable for testing + * Visible for testing */ private[history] def doMergeApplicationListing( reader: EventLogFileReader, diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala b/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala index 1b8c7ff26e9f5..4eb5c15d4ed18 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala @@ -52,7 +52,7 @@ private[history] class HybridStore extends KVStore { // A background thread that dumps data from inMemoryStore to levelDB private var backgroundThread: Thread = null - // A hash map that stores all classes that had been writen to inMemoryStore + // A hash map that stores all classes that had been written to inMemoryStore // Visible for testing private[history] val klassMap = new ConcurrentHashMap[Class[_], Boolean] diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index efb0b2c26d9a9..c81ac778a32d1 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -685,7 +685,7 @@ private[spark] class Executor( // SPARK-20904: Do not report failure to driver if if happened during shut down. Because // libraries may set up shutdown hooks that race with running tasks during shutdown, // spurious failures may occur and can result in improper accounting in the driver (e.g. - // the task failure would not be ignored if the shutdown happened because of premption, + // the task failure would not be ignored if the shutdown happened because of preemption, // instead of an app issue). if (!ShutdownHookManager.inShutdown()) { val (accums, accUpdates) = collectAccumulatorsAndResetStatusOnFailure(taskStartTimeNs) @@ -744,7 +744,7 @@ private[spark] class Executor( * sending a Thread.interrupt(), and monitoring the task until it finishes. * * Spark's current task cancellation / task killing mechanism is "best effort" because some tasks - * may not be interruptable or may not respond to their "killed" flags being set. If a significant + * may not be interruptible or may not respond to their "killed" flags being set. If a significant * fraction of a cluster's task slots are occupied by tasks that have been marked as killed but * remain running then this can lead to a situation where new jobs and tasks are starved of * resources that are being used by these zombie tasks. 
diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala index d98d5e3b81aa0..bddd18adc683e 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala @@ -102,7 +102,7 @@ private[spark] class MetricsConfig(conf: SparkConf) extends Logging { * * @param prop the flat list of properties to "unflatten" based on prefixes * @param regex the regex that the prefix has to comply with - * @return an unflatted map, mapping prefix with sub-properties under that prefix + * @return an unflattened map, mapping prefix with sub-properties under that prefix */ def subProperties(prop: Properties, regex: Regex): mutable.HashMap[String, Properties] = { val subProperties = new mutable.HashMap[String, Properties] diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala index 59b863b89f75a..e9c2974622300 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala @@ -56,7 +56,7 @@ private[spark] class PrometheusServlet( def getMetricsSnapshot(request: HttpServletRequest): String = { import scala.collection.JavaConverters._ - val guagesLabel = """{type="gauges"}""" + val gaugesLabel = """{type="gauges"}""" val countersLabel = """{type="counters"}""" val metersLabel = countersLabel val histogramslabels = """{type="histograms"}""" @@ -65,8 +65,8 @@ private[spark] class PrometheusServlet( val sb = new StringBuilder() registry.getGauges.asScala.foreach { case (k, v) => if (!v.getValue.isInstanceOf[String]) { - sb.append(s"${normalizeKey(k)}Number$guagesLabel ${v.getValue}\n") - sb.append(s"${normalizeKey(k)}Value$guagesLabel ${v.getValue}\n") + sb.append(s"${normalizeKey(k)}Number$gaugesLabel ${v.getValue}\n") + sb.append(s"${normalizeKey(k)}Value$gaugesLabel ${v.getValue}\n") } } registry.getCounters.asScala.foreach { case (k, v) => diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index 943abae17a911..39f69567981ea 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -173,7 +173,7 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { if (buckets.length < 2) { throw new IllegalArgumentException("buckets array must have at least two elements") } - // The histogramPartition function computes the partail histogram for a given + // The histogramPartition function computes the partial histogram for a given // partition. The provided bucketFunction determines which bucket in the array // to increment or returns None if there is no bucket. 
This is done so we can // specialize for uniformly distributed buckets and save the O(log n) binary diff --git a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala index 5b1c024257529..3cefcb16d6eb1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/OrderedRDDFunctions.scala @@ -88,10 +88,10 @@ class OrderedRDDFunctions[K : Ordering : ClassTag, val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => - val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { + val partitionIndices = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } - PartitionPruningRDD.create(self, partitionIndicies.contains) + PartitionPruningRDD.create(self, partitionIndices.contains) case _ => self } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 15b00a4496da6..65b39c4b65603 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -327,7 +327,7 @@ abstract class RDD[T: ClassTag]( /** * Internal method to this RDD; will read from cache if applicable, or otherwise compute it. - * This should ''not'' be called by users directly, but is available for implementors of custom + * This should ''not'' be called by users directly, but is available for implementers of custom * subclasses of RDD. */ final def iterator(split: Partition, context: TaskContext): Iterator[T] = { diff --git a/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala index d3f979fa8672f..12ef34241f9cb 100644 --- a/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala +++ b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala @@ -20,7 +20,7 @@ package org.apache.spark.resource import org.apache.spark.annotation.{Evolving, Since} /** - * A task resource request. This is used in conjuntion with the ResourceProfile to + * A task resource request. This is used in conjunction with the ResourceProfile to * programmatically specify the resources needed for an RDD that will be applied at the * stage level. 
* diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index fcb9fe422c0d4..5864e9e2ceac0 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -254,14 +254,14 @@ private[netty] class NettyRpcEnv( val timeoutCancelable = timeoutScheduler.schedule(new Runnable { override def run(): Unit = { - val remoteReceAddr = if (remoteAddr == null) { + val remoteRecAddr = if (remoteAddr == null) { Try { message.receiver.client.getChannel.remoteAddress() }.toOption.orNull } else { remoteAddr } - onFailure(new TimeoutException(s"Cannot receive any reply from ${remoteReceAddr} " + + onFailure(new TimeoutException(s"Cannot receive any reply from ${remoteRecAddr} " + s"in ${timeout.duration}")) } }, timeout.duration.toNanos, TimeUnit.NANOSECONDS) diff --git a/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala b/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala index 043c6b90384b4..8f0764ed1a61e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/BarrierJobAllocationFailed.scala @@ -45,10 +45,10 @@ private[spark] object BarrierJobAllocationFailed { val ERROR_MESSAGE_RUN_BARRIER_WITH_UNSUPPORTED_RDD_CHAIN_PATTERN = "[SPARK-24820][SPARK-24821]: Barrier execution mode does not allow the following pattern of " + "RDD chain within a barrier stage:\n1. Ancestor RDDs that have different number of " + - "partitions from the resulting RDD (eg. union()/coalesce()/first()/take()/" + + "partitions from the resulting RDD (e.g. union()/coalesce()/first()/take()/" + "PartitionPruningRDD). A workaround for first()/take() can be barrierRdd.collect().head " + "(scala) or barrierRdd.collect()[0] (python).\n" + - "2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2))." + "2. An RDD that depends on multiple barrier RDDs (e.g. barrierRdd1.zip(barrierRdd2))." // Error message when running a barrier stage with dynamic resource allocation enabled. val ERROR_MESSAGE_RUN_BARRIER_WITH_DYN_ALLOCATION = diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 6fb0fb93f253b..02f5bb8cccd52 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -409,9 +409,9 @@ private[spark] class DAGScheduler( /** * Check to make sure we don't launch a barrier stage with unsupported RDD chain pattern. The * following patterns are not supported: - * 1. Ancestor RDDs that have different number of partitions from the resulting RDD (eg. + * 1. Ancestor RDDs that have different number of partitions from the resulting RDD (e.g. * union()/coalesce()/first()/take()/PartitionPruningRDD); - * 2. An RDD that depends on multiple barrier RDDs (eg. barrierRdd1.zip(barrierRdd2)). + * 2. An RDD that depends on multiple barrier RDDs (e.g. barrierRdd1.zip(barrierRdd2)). */ private def checkBarrierStageWithRDDChainPattern(rdd: RDD[_], numTasksInStage: Int): Unit = { if (rdd.isBarrier() && @@ -459,7 +459,7 @@ private[spark] class DAGScheduler( /** * We don't support run a barrier stage with dynamic resource allocation enabled, it shall lead - * to some confusing behaviors (eg. 
with dynamic resource allocation enabled, it may happen that + * to some confusing behaviors (e.g. with dynamic resource allocation enabled, it may happen that * we acquire some executors (but not enough to launch all the tasks in a barrier stage) and * later release them due to executor idle time expire, and then acquire again). * @@ -1555,7 +1555,7 @@ private[spark] class DAGScheduler( event.reason) if (!stageIdToStage.contains(task.stageId)) { - // The stage may have already finished when we get this event -- eg. maybe it was a + // The stage may have already finished when we get this event -- e.g. maybe it was a // speculative task. It is important that we send the TaskEnd event in any case, so listeners // are properly notified and can chose to handle it. For instance, some listeners are // doing their own accounting and if they don't get the task end event they think diff --git a/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala b/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala index 9bbacea94bf68..c6b8dca3597ba 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/HealthTracker.scala @@ -32,7 +32,7 @@ import org.apache.spark.util.{Clock, SystemClock, Utils} * additional logic for exclusion of executors and nodes for individual tasks and stages which * works in concert with the logic here. * - * The tracker needs to deal with a variety of workloads, eg.: + * The tracker needs to deal with a variety of workloads, e.g.: * * * bad user code -- this may lead to many task failures, but that should not count against * individual executors @@ -362,7 +362,7 @@ private[scheduler] class HealthTracker ( * Apply the timeout to individual tasks. This is to prevent one-off failures that are very * spread out in time (and likely have nothing to do with problems on the executor) from * triggering exlusion. However, note that we do *not* remove executors and nodes from - * being excluded as we expire individual task failures -- each have their own timeout. Eg., + * being excluded as we expire individual task failures -- each have their own timeout. E.g., * suppose: * * timeout = 10, maxFailuresPerExec = 2 * * Task 1 fails on exec 1 at time 0 diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 0cfa76583bfbb..914fccc1a67cd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -216,7 +216,7 @@ private[spark] class TaskSetManager( /** * Track the set of locality levels which are valid given the tasks locality preferences and * the set of currently available executors. This is updated as executors are added and removed. - * This allows a performance optimization, of skipping levels that aren't relevant (eg., skip + * This allows a performance optimization, of skipping levels that aren't relevant (e.g., skip * PROCESS_LOCAL if no tasks could be run PROCESS_LOCAL for the current set of executors). 
*/ private[scheduler] var myLocalityLevels = computeValidLocalityLevels() diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index a4df0d543ecbe..4ebb7b0defd7f 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -167,7 +167,7 @@ private[spark] object CryptoStreamUtils extends Logging { } /** - * SPARK-25535. The commons-cryto library will throw InternalError if something goes + * SPARK-25535. The commons-crypto library will throw InternalError if something goes * wrong, and leave bad state behind in the Java wrappers, so it's not safe to use them * afterwards. This wrapper detects that situation and avoids further calls into the * commons-crypto code, while still allowing the underlying streams to be closed. diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 924601f92c5b8..072702b343328 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -1103,7 +1103,7 @@ private[spark] class BlockManager( blockSize: Long): Option[ManagedBuffer] = { val file = ExecutorDiskUtils.getFile(localDirs, subDirsPerLocalDir, blockId.name) if (file.exists()) { - val mangedBuffer = securityManager.getIOEncryptionKey() match { + val managedBuffer = securityManager.getIOEncryptionKey() match { case Some(key) => // Encrypted blocks cannot be memory mapped; return a special object that does decryption // and provides InputStream / FileRegion implementations for reading the data. @@ -1114,7 +1114,7 @@ private[spark] class BlockManager( val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle") new FileSegmentManagedBuffer(transportConf, file, 0, file.length) } - Some(mangedBuffer) + Some(managedBuffer) } else { None } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 4d565511704d4..eada4b3ee2e38 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -357,7 +357,7 @@ class BlockManagerMasterEndpoint( blockLocations.remove(blockId) logWarning(s"No more replicas available for $blockId !") } else if (proactivelyReplicate && (blockId.isRDD || blockId.isInstanceOf[TestBlockId])) { - // As a heursitic, assume single executor failure to find out the number of replicas that + // As a heuristic, assume single executor failure to find out the number of replicas that // existed before failure val maxReplicas = locations.size + 1 val i = (new Random(blockId.hashCode)).nextInt(locations.size) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 5f5a08fe0e574..cfe15eb832273 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -85,7 +85,7 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We } // The timeline library treats contents as HTML, so we have to escape them. 
We need to add - // extra layers of escaping in order to embed this in a Javascript string literal. + // extra layers of escaping in order to embed this in a JavaScript string literal. val escapedDesc = Utility.escape(jobDescription) val jsEscapedDescForTooltip = StringEscapeUtils.escapeEcmaScript(Utility.escape(escapedDesc)) val jsEscapedDescForLabel = StringEscapeUtils.escapeEcmaScript(escapedDesc) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 19eccc5209b8e..c40e1bc248a49 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -68,7 +68,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP .getOrElse(System.currentTimeMillis()) // The timeline library treats contents as HTML, so we have to escape them. We need to add - // extra layers of escaping in order to embed this in a Javascript string literal. + // extra layers of escaping in order to embed this in a JavaScript string literal. val escapedName = Utility.escape(name) val jsEscapedNameForTooltip = StringEscapeUtils.escapeEcmaScript(Utility.escape(escapedName)) val jsEscapedNameForLabel = StringEscapeUtils.escapeEcmaScript(escapedName) diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index 6ffd6605f75b8..7e2b9c72ad91b 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -285,7 +285,7 @@ private[spark] object ClosureCleaner extends Logging { logDebug(s" + outermost object is a closure, so we clone it: ${outermostClass}") } else if (outermostClass.getName.startsWith("$line")) { // SPARK-14558: if the outermost object is a REPL line object, we should clone - // and clean it as it may carray a lot of unnecessary information, + // and clean it as it may carry a lot of unnecessary information, // e.g. hadoop conf, spark conf, etc. logDebug(s" + outermost object is a REPL line object, so we clone it:" + s" ${outermostClass}") diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 71a310a4279ad..accf3d7c0d333 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -376,7 +376,7 @@ private[spark] object Utils extends Logging { * This returns a new InputStream which contains the same data as the original input stream. * It may be entirely on in-memory buffer, or it may be a combination of in-memory data, and then * continue to read from the original stream. The only real use of this is if the original input - * stream will potentially detect corruption while the data is being read (eg. from compression). + * stream will potentially detect corruption while the data is being read (e.g. from compression). * This allows for an eager check of corruption in the first maxSize bytes of data. 
* * @return An InputStream which includes all data from the original stream (combining buffered @@ -1067,20 +1067,20 @@ private[spark] object Utils extends Logging { } // checks if the hostport contains IPV6 ip and parses the host, port if (hostPort != null && hostPort.split(":").length > 2) { - val indx: Int = hostPort.lastIndexOf("]:") - if (-1 == indx) { + val index: Int = hostPort.lastIndexOf("]:") + if (-1 == index) { return setDefaultPortValue } - val port = hostPort.substring(indx + 2).trim() - val retval = (hostPort.substring(0, indx + 1).trim(), if (port.isEmpty) 0 else port.toInt) + val port = hostPort.substring(index + 2).trim() + val retval = (hostPort.substring(0, index + 1).trim(), if (port.isEmpty) 0 else port.toInt) hostPortParseResults.putIfAbsent(hostPort, retval) } else { - val indx: Int = hostPort.lastIndexOf(':') - if (-1 == indx) { + val index: Int = hostPort.lastIndexOf(':') + if (-1 == index) { return setDefaultPortValue } - val port = hostPort.substring(indx + 1).trim() - val retval = (hostPort.substring(0, indx).trim(), if (port.isEmpty) 0 else port.toInt) + val port = hostPort.substring(index + 1).trim() + val retval = (hostPort.substring(0, index).trim(), if (port.isEmpty) 0 else port.toInt) hostPortParseResults.putIfAbsent(hostPort, retval) } @@ -2854,11 +2854,11 @@ private[spark] object Utils extends Logging { if (lastDollarIndex < s.length - 1) { // The last char is not a dollar sign if (lastDollarIndex == -1 || !s.contains("$iw")) { - // The name does not have dollar sign or is not an intepreter + // The name does not have dollar sign or is not an interpreter // generated class, so we should return the full string s } else { - // The class name is intepreter generated, + // The class name is interpreter generated, // return the part after the last dollar sign // This is the same behavior as getClass.getSimpleName s.substring(lastDollarIndex + 1) diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala index 2c3730de08b5b..8635f1a3d702e 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala @@ -193,7 +193,7 @@ private[spark] object ChunkedByteBuffer { length: Long): ChunkedByteBuffer = { // We do *not* memory map the file, because we may end up putting this into the memory store, // and spark currently is not expecting memory-mapped buffers in the memory store, it conflicts - // with other parts that manage the lifecyle of buffers and dispose them. See SPARK-25422. + // with other parts that manage the lifecycle of buffers and dispose them. See SPARK-25422. 
val is = new FileInputStream(file) ByteStreams.skipFully(is, offset) val in = new LimitedInputStream(is, length) diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java index ee8e38c24b47f..df1d306e628a9 100644 --- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java @@ -68,10 +68,10 @@ public class UnsafeShuffleWriterSuite { static final int DEFAULT_INITIAL_SORT_BUFFER_SIZE = 4096; - static final int NUM_PARTITITONS = 4; + static final int NUM_PARTITIONS = 4; TestMemoryManager memoryManager; TaskMemoryManager taskMemoryManager; - final HashPartitioner hashPartitioner = new HashPartitioner(NUM_PARTITITONS); + final HashPartitioner hashPartitioner = new HashPartitioner(NUM_PARTITIONS); File mergedOutputFile; File tempDir; long[] partitionSizesInMergedFile; @@ -194,7 +194,7 @@ private void assertSpillFilesWereCleanedUp() { private List> readRecordsFromFile() throws IOException { final ArrayList> recordsList = new ArrayList<>(); long startOffset = 0; - for (int i = 0; i < NUM_PARTITITONS; i++) { + for (int i = 0; i < NUM_PARTITIONS; i++) { final long partitionSize = partitionSizesInMergedFile[i]; if (partitionSize > 0) { FileInputStream fin = new FileInputStream(mergedOutputFile); @@ -253,7 +253,7 @@ public void writeEmptyIterator() throws Exception { assertTrue(mapStatus.isDefined()); assertTrue(mergedOutputFile.exists()); assertEquals(0, spillFilesCreated.size()); - assertArrayEquals(new long[NUM_PARTITITONS], partitionSizesInMergedFile); + assertArrayEquals(new long[NUM_PARTITIONS], partitionSizesInMergedFile); assertEquals(0, taskMetrics.shuffleWriteMetrics().recordsWritten()); assertEquals(0, taskMetrics.shuffleWriteMetrics().bytesWritten()); assertEquals(0, taskMetrics.diskBytesSpilled()); @@ -264,7 +264,7 @@ public void writeEmptyIterator() throws Exception { public void writeWithoutSpilling() throws Exception { // In this example, each partition should have exactly one record: final ArrayList> dataToWrite = new ArrayList<>(); - for (int i = 0; i < NUM_PARTITITONS; i++) { + for (int i = 0; i < NUM_PARTITIONS; i++) { dataToWrite.add(new Tuple2<>(i, i)); } final UnsafeShuffleWriter writer = createWriter(true); diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java index dbaca71c5fdc3..e73ac0e9fb7a6 100644 --- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java @@ -1518,7 +1518,7 @@ public void testAsyncActionErrorWrapping() throws Exception { JavaFutureAction future = rdd.map(new BuggyMapFunction<>()).countAsync(); try { future.get(2, TimeUnit.SECONDS); - fail("Expected future.get() for failed job to throw ExcecutionException"); + fail("Expected future.get() for failed job to throw ExecutionException"); } catch (ExecutionException ee) { assertTrue(Throwables.getStackTraceAsString(ee).contains("Custom exception!")); } diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index 21090e98ea285..e42df0821589b 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -635,12 +635,12 @@ class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext { 
// Verify that RDD is checkpointed assert(rdd.firstParent.isInstanceOf[ReliableCheckpointRDD[_]]) val checkpointedRDD = rdd.firstParent.asInstanceOf[ReliableCheckpointRDD[_]] - val partiton = checkpointedRDD.partitions(0) - assert(!checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partiton)) + val partition = checkpointedRDD.partitions(0) + assert(!checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partition)) - val preferredLoc = checkpointedRDD.preferredLocations(partiton) - assert(checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partiton)) - assert(preferredLoc == checkpointedRDD.cachedPreferredLocations.get(partiton)) + val preferredLoc = checkpointedRDD.preferredLocations(partition) + assert(checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partition)) + assert(preferredLoc == checkpointedRDD.cachedPreferredLocations.get(partition)) } } @@ -653,7 +653,7 @@ class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext { val rdd = sc.makeRDD(1 to 200, numSlices = 4).repartition(1).mapPartitions { iter => iter.map { i => if (i > 100 && TaskContext.get().stageAttemptNumber() == 0) { - // throw new SparkException("Make first attemp failed.") + // throw new SparkException("Make first attempt failed.") // Throw FetchFailedException to explicitly trigger stage resubmission. // A normal exception will only trigger task resubmission in the same stage. throw new FetchFailedException(null, 0, 0L, 0, 0, "Fake") diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala index 81530a8fda84d..5434e82c95b1b 100644 --- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala @@ -368,7 +368,7 @@ class CleanerTester( val toBeCleanedRDDIds = new HashSet[Int] ++= rddIds val toBeCleanedShuffleIds = new HashSet[Int] ++= shuffleIds - val toBeCleanedBroadcstIds = new HashSet[Long] ++= broadcastIds + val toBeCleanedBroadcastIds = new HashSet[Long] ++= broadcastIds val toBeCheckpointIds = new HashSet[Long] ++= checkpointIds val isDistributed = !sc.isLocal @@ -384,7 +384,7 @@ class CleanerTester( } def broadcastCleaned(broadcastId: Long): Unit = { - toBeCleanedBroadcstIds.synchronized { toBeCleanedBroadcstIds -= broadcastId } + toBeCleanedBroadcastIds.synchronized { toBeCleanedBroadcastIds -= broadcastId } logInfo("Broadcast " + broadcastId + " cleaned") } @@ -508,8 +508,8 @@ class CleanerTester( val s2 = toBeCleanedShuffleIds.synchronized { toBeCleanedShuffleIds.toSeq.sorted.mkString("[", ", ", "]") } - val s3 = toBeCleanedBroadcstIds.synchronized { - toBeCleanedBroadcstIds.toSeq.sorted.mkString("[", ", ", "]") + val s3 = toBeCleanedBroadcastIds.synchronized { + toBeCleanedBroadcastIds.toSeq.sorted.mkString("[", ", ", "]") } s""" |\tRDDs = $s1 @@ -521,7 +521,7 @@ class CleanerTester( private def isAllCleanedUp = toBeCleanedRDDIds.synchronized { toBeCleanedRDDIds.isEmpty } && toBeCleanedShuffleIds.synchronized { toBeCleanedShuffleIds.isEmpty } && - toBeCleanedBroadcstIds.synchronized { toBeCleanedBroadcstIds.isEmpty } && + toBeCleanedBroadcastIds.synchronized { toBeCleanedBroadcastIds.isEmpty } && toBeCheckpointIds.synchronized { toBeCheckpointIds.isEmpty } private def getRDDBlocks(rddId: Int): Seq[BlockId] = { diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index d1edb80e40b21..c1269a9c91049 
100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -268,7 +268,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("add executors multiple profiles initial num same as needed") { // test when the initial number of executors equals the number needed for the first - // stage using a non default profile to make sure we request the intitial number + // stage using a non default profile to make sure we request the initial number // properly. Here initial is 2, each executor in ResourceProfile 1 can have 2 tasks // per executor, and start a stage with 4 tasks, which would need 2 executors. val clock = new ManualClock(8888L) diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index e9ee6b5dfb665..f953bf4043f33 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -170,7 +170,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { val nums = sc.makeRDD(1 to 3).map(x => (x, "a" * x)) // (1,a), (2,aa), (3,aaa) nums.saveAsSequenceFile(outputDir) // Similar to the tests above, we read a SequenceFile, but this time we pass type params - // that are convertable to Writable instead of calling sequenceFile[IntWritable, Text] + // that are convertible to Writable instead of calling sequenceFile[IntWritable, Text] val output1 = sc.sequenceFile[Int, String](outputDir) assert(output1.collect().toList === List((1, "a"), (2, "aa"), (3, "aaa"))) // Also try having one type be a subclass of Writable and one not diff --git a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala index e97b9d5d6bea6..eff4fd20d7fca 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala @@ -21,7 +21,7 @@ import java.io.{File, FileOutputStream, OutputStream} /** * A base class for generate benchmark results to a file. - * For JDK9+, JDK major version number is added to the file names to distingush the results. + * For JDK9+, JDK major version number is added to the file names to distinguish the results. 
*/ abstract class BenchmarkBase { var output: Option[OutputStream] = None diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 0b0754be2f56f..3b8677742ca16 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -926,8 +926,8 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { oldProvider.listing.setMetadata(meta) oldProvider.stop() - val mistatchedVersionProvider = new FsHistoryProvider(conf) - assert(mistatchedVersionProvider.listing.count(classOf[ApplicationInfoWrapper]) === 0) + val mismatchedVersionProvider = new FsHistoryProvider(conf) + assert(mismatchedVersionProvider.listing.count(classOf[ApplicationInfoWrapper]) === 0) } test("invalidate cached UI") { diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index a46799df069d6..b1b97a61ed1f0 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -784,7 +784,7 @@ class MasterSuite extends SparkFunSuite var worker: MockExecutorLaunchFailWorker = null try { val conf = new SparkConf() - // SPARK-32250: When running test on Github Action machine, the available processors in JVM + // SPARK-32250: When running test on GitHub Action machine, the available processors in JVM // is only 2, while on Jenkins it's 32. For this specific test, 2 available processors, which // also decides number of threads in Dispatcher, is not enough to consume the messages. In // the worst situation, MockExecutorLaunchFailWorker would occupy these 2 threads for diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala index 5bbd60f99f77e..8ed861ad34ea7 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/WorkerSuite.scala @@ -342,7 +342,7 @@ class WorkerSuite extends SparkFunSuite with Matchers with BeforeAndAfter { testWorkDirCleanupAndRemoveMetadataWithConfig(true) } - test("WorkdDirCleanup cleans only app dirs when" + + test("WorkDirCleanup cleans only app dirs when" + "spark.shuffle.service.db.enabled=false") { testWorkDirCleanupAndRemoveMetadataWithConfig(false) } diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 1326ae3c11a06..5b868604ecf94 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -552,7 +552,7 @@ class ExecutorSuite extends SparkFunSuite if (poll) { executor.metricsPoller.poll() } - executor.killAllTasks(true, "Killed task, eg. because of speculative execution") + executor.killAllTasks(true, "Killed task, e.g. 
because of speculative execution") } else { timedOut.set(true) } diff --git a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala index 2bd32fc927e21..778f748f83950 100644 --- a/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/io/FileCommitProtocolInstantiationSuite.scala @@ -75,7 +75,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { /** * Create a classic two-arg protocol instance. - * @param dynamic dyanmic partitioning mode + * @param dynamic dynamic partitioning mode * @return the instance */ private def instantiateClassic(dynamic: Boolean): ClassicConstructorCommitProtocol = { @@ -88,7 +88,7 @@ class FileCommitProtocolInstantiationSuite extends SparkFunSuite { /** * Create a three-arg protocol instance. - * @param dynamic dyanmic partitioning mode + * @param dynamic dynamic partitioning mode * @return the instance */ private def instantiateNew( diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index 330347299ab56..905bb8110736d 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -213,7 +213,7 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext } // Computing the amount of bytes read for a cartesian operation is a little involved. - // Cartesian interleaves reads between two partitions eg. p1 and p2. + // Cartesian interleaves reads between two partitions e.g. p1 and p2. 
// Here are the steps: // 1) First it creates an iterator for p1 // 2) Creates an iterator for p2 diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala index 182c3c09e0524..c8a8f37212a82 100644 --- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala @@ -88,7 +88,7 @@ class NettyBlockTransferServiceSuite } test("SPARK-27637: test fetch block with executor dead") { - implicit val exectionContext = ExecutionContext.global + implicit val executionContext = ExecutionContext.global val port = 17634 + Random.nextInt(10000) logInfo("random port for test: " + port) diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 2de4b109e40e9..a669993352fe7 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.mapred._ import org.apache.hadoop.mapreduce.{Job => NewJob, JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, - RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} + RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttemptContext} import org.apache.hadoop.util.Progressable import org.scalatest.Assertions @@ -892,7 +892,7 @@ class FakeOutputFormat() extends OutputFormat[Integer, Integer]() { */ class NewFakeWriter extends NewRecordWriter[Integer, Integer] { - def close(p1: NewTaskAttempContext): Unit = () + def close(p1: NewTaskAttemptContext): Unit = () def write(p1: Integer, p2: Integer): Unit = () @@ -901,24 +901,24 @@ class NewFakeWriter extends NewRecordWriter[Integer, Integer] { class NewFakeCommitter extends NewOutputCommitter { def setupJob(p1: NewJobContext): Unit = () - def needsTaskCommit(p1: NewTaskAttempContext): Boolean = false + def needsTaskCommit(p1: NewTaskAttemptContext): Boolean = false - def setupTask(p1: NewTaskAttempContext): Unit = () + def setupTask(p1: NewTaskAttemptContext): Unit = () - def commitTask(p1: NewTaskAttempContext): Unit = () + def commitTask(p1: NewTaskAttemptContext): Unit = () - def abortTask(p1: NewTaskAttempContext): Unit = () + def abortTask(p1: NewTaskAttemptContext): Unit = () } class NewFakeFormat() extends NewOutputFormat[Integer, Integer]() { def checkOutputSpecs(p1: NewJobContext): Unit = () - def getRecordWriter(p1: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + def getRecordWriter(p1: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { new NewFakeWriter() } - def getOutputCommitter(p1: NewTaskAttempContext): NewOutputCommitter = { + def getOutputCommitter(p1: NewTaskAttemptContext): NewOutputCommitter = { new NewFakeCommitter() } } @@ -958,7 +958,7 @@ class FakeFormatWithCallback() extends FakeOutputFormat { } class NewFakeWriterWithCallback extends NewFakeWriter { - override def close(p1: NewTaskAttempContext): Unit = { + override def close(p1: NewTaskAttemptContext): Unit = { FakeWriterWithCallback.calledBy += "close" } @@ -972,7 +972,7 @@ class NewFakeWriterWithCallback extends NewFakeWriter { } class NewFakeFormatWithCallback() extends NewFakeFormat { - override def getRecordWriter(p1: 
NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + override def getRecordWriter(p1: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { new NewFakeWriterWithCallback() } } @@ -982,27 +982,27 @@ class YetAnotherFakeCommitter extends NewOutputCommitter with Assertions { JobID.jobid = j.getJobID().getId } - def needsTaskCommit(t: NewTaskAttempContext): Boolean = false + def needsTaskCommit(t: NewTaskAttemptContext): Boolean = false - def setupTask(t: NewTaskAttempContext): Unit = { + def setupTask(t: NewTaskAttemptContext): Unit = { val jobId = t.getTaskAttemptID().getJobID().getId assert(jobId === JobID.jobid) } - def commitTask(t: NewTaskAttempContext): Unit = {} + def commitTask(t: NewTaskAttemptContext): Unit = {} - def abortTask(t: NewTaskAttempContext): Unit = {} + def abortTask(t: NewTaskAttemptContext): Unit = {} } class YetAnotherFakeFormat() extends NewOutputFormat[Integer, Integer]() { def checkOutputSpecs(j: NewJobContext): Unit = {} - def getRecordWriter(t: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + def getRecordWriter(t: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { new NewFakeWriter() } - def getOutputCommitter(t: NewTaskAttempContext): NewOutputCommitter = { + def getOutputCommitter(t: NewTaskAttemptContext): NewOutputCommitter = { new YetAnotherFakeCommitter() } } @@ -1021,7 +1021,7 @@ class ConfigTestFormat() extends NewFakeFormat() with Configurable { def getConf: Configuration = null - override def getRecordWriter(p1: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + override def getRecordWriter(p1: NewTaskAttemptContext): NewRecordWriter[Integer, Integer] = { assert(setConfCalled, "setConf was never called") super.getRecordWriter(p1) } diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 8962fd6740bf6..df8ac2ef744cd 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -1102,7 +1102,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { } } - test("RDD.partitions() fails fast when partitions indicies are incorrect (SPARK-13021)") { + test("RDD.partitions() fails fast when partitions indices are incorrect (SPARK-13021)") { class BadRDD[T: ClassTag](prev: RDD[T]) extends RDD[T](prev) { override def compute(part: Partition, context: TaskContext): Iterator[T] = { diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index e8e8682e20ed4..eac45e6ac5801 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -221,7 +221,7 @@ class ResourceUtilsSuite extends SparkFunSuite val conf = new SparkConf assume(!(Utils.isWindows)) withTempDir { dir => - val gpuDiscovery = createTempScriptWithExpectedOutput(dir, "gpuDisocveryScript", + val gpuDiscovery = createTempScriptWithExpectedOutput(dir, "gpuDiscoveryScript", """{"name": "gpu", "addresses": ["0", "1"]}""") conf.set(DRIVER_GPU_ID.amountConf, "2") conf.set(DRIVER_GPU_ID.discoveryScriptConf, gpuDiscovery) diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala index c2730f90ed982..fe6d0db837bda 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala 
+++ b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala @@ -73,7 +73,7 @@ class NettyRpcEnvSuite extends RpcEnvSuite with MockitoSugar with TimeLimits { val nettyEnv = env.asInstanceOf[NettyRpcEnv] val client = mock[TransportClient] - val senderAddress = RpcAddress("locahost", 12345) + val senderAddress = RpcAddress("localhost", 12345) val receiverAddress = RpcEndpointAddress("localhost", 54321, "test") val receiver = new NettyRpcEndpointRef(nettyEnv.conf, receiverAddress, nettyEnv) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 58aa246b7358f..194e0dfe312d5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2569,7 +2569,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti val newTaskSet = taskSets(1) // 2 tasks should have been re-submitted, for tasks 0 and 1 (which ran on hostA). assert(newTaskSet.tasks.size === 2) - // Complete task 0 from the original task set (i.e., not hte one that's currently active). + // Complete task 0 from the original task set (i.e., not the one that's currently active). // This should still be counted towards the job being complete (but there's still one // outstanding task). runEvent(makeCompletionEvent(newTaskSet.tasks(0), Success, makeMapStatus("hostB", 2))) @@ -3057,7 +3057,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti assertResultStageFailToRollback(shuffleMapRdd) } - private def assertResultStageNotRollbacked(mapRdd: MyRDD): Unit = { + private def assertResultStageNotRolledBack(mapRdd: MyRDD): Unit = { val shuffleDep = new ShuffleDependency(mapRdd, new HashPartitioner(2)) val shuffleId = shuffleDep.shuffleId val finalRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) @@ -3097,7 +3097,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti val shuffleMapRdd = new MyCheckpointRDD(sc, 2, Nil, indeterminate = true) shuffleMapRdd.checkpoint() shuffleMapRdd.doCheckpoint() - assertResultStageNotRollbacked(shuffleMapRdd) + assertResultStageNotRolledBack(shuffleMapRdd) } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index e6fbf9b09d43d..cb50c7c959754 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -255,7 +255,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp /* * This is a dummy input stream that wraps another input stream but ends prematurely when - * reading at the specified position, throwing an EOFExeption. + * reading at the specified position, throwing an EOFException. 
*/ private class EarlyEOFInputStream(in: InputStream, failAtPos: Int) extends InputStream { private val countDown = new AtomicInteger(failAtPos) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 0874163b0e946..88d2868b957f9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -44,7 +44,7 @@ import org.apache.spark.util.{CallSite, ThreadUtils, Utils} * TaskSetManagers. * * Test cases are configured by providing a set of jobs to submit, and then simulating interaction - * with spark's executors via a mocked backend (eg., task completion, task failure, executors + * with spark's executors via a mocked backend (e.g., task completion, task failure, executors * disconnecting, etc.). */ abstract class SchedulerIntegrationSuite[T <: MockBackend: ClassTag] extends SparkFunSuite @@ -372,7 +372,7 @@ private[spark] abstract class MockBackend( /** * Accessed by both scheduling and backend thread, so should be protected by this. - * Most likely the only thing that needs to be protected are the inidividual ExecutorTaskStatus, + * Most likely the only thing that needs to be protected are the individual ExecutorTaskStatus, * but for simplicity in this mock just lock the whole backend. */ def executorIdToExecutor: Map[String, ExecutorTaskStatus] @@ -535,8 +535,8 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor */ testScheduler("super simple job") { def runBackend(): Unit = { - val (taskDescripition, _) = backend.beginTask() - backend.taskSuccess(taskDescripition, 42) + val (taskDescription, _) = backend.beginTask() + backend.taskSuccess(taskDescription, 42) } withBackend(runBackend _) { val jobFuture = submit(new MockRDD(sc, 10, Nil), (0 until 10).toArray) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index a4a84b0e89809..d72744c5cc348 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -571,9 +571,9 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match } } - test("event queue size can be configued through spark conf") { + test("event queue size can be configured through spark conf") { // configure the shared queue size to be 1, event log queue size to be 2, - // and listner bus event queue size to be 5 + // and listener bus event queue size to be 5 val conf = new SparkConf(false) .set(LISTENER_BUS_EVENT_QUEUE_CAPACITY, 5) .set(s"spark.scheduler.listenerbus.eventqueue.${SHARED_QUEUE}.capacity", "1") @@ -593,7 +593,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // check the size of shared queue is 1 as configured assert(bus.getQueueCapacity(SHARED_QUEUE) == Some(1)) // no specific size of status queue is configured, - // it shoud use the LISTENER_BUS_EVENT_QUEUE_CAPACITY + // it should use the LISTENER_BUS_EVENT_QUEUE_CAPACITY assert(bus.getQueueCapacity(APP_STATUS_QUEUE) == Some(5)) // check the size of event log queue is 5 as configured assert(bus.getQueueCapacity(EVENT_LOG_QUEUE) == Some(2)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index a760dda3897df..3bf6cc226c0aa 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -377,8 +377,8 @@ class TaskSetManagerSuite // offers not accepted due to task set zombies are not delay schedule rejects manager.isZombie = true - val (taskDesciption, delayReject) = manager.resourceOffer("exec2", "host2", ANY) - assert(taskDesciption.isEmpty) + val (taskDescription, delayReject) = manager.resourceOffer("exec2", "host2", ANY) + assert(taskDescription.isEmpty) assert(delayReject === false) manager.isZombie = false @@ -1322,7 +1322,7 @@ class TaskSetManagerSuite test("SPARK-19868: DagScheduler only notified of taskEnd when state is ready") { // dagScheduler.taskEnded() is async, so it may *seem* ok to call it before we've set all - // appropriate state, eg. isZombie. However, this sets up a race that could go the wrong way. + // appropriate state, e.g. isZombie. However, this sets up a race that could go the wrong way. // This is a super-focused regression test which checks the zombie state as soon as // dagScheduler.taskEnded() is called, to ensure we haven't introduced a race. sc = new SparkContext("local", "test") diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 6ca1109791c35..a251c164a79ca 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -234,7 +234,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Send two executor metrics update. Only update one metric to avoid a lot of boilerplate code. // The tasks are distributed among the two executors, so the executor-level metrics should - // hold half of the cummulative value of the metric being updated. + // hold half of the cumulative value of the metric being updated. 
Seq(1L, 2L).foreach { value => s1Tasks.foreach { task => val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 144489c5f7922..44b6f1b82e75a 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -1712,12 +1712,12 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val externalShuffleServicePort = StorageUtils.externalShuffleServicePort(conf) val port = store.blockTransferService.port val rack = Some("rack") - val blockManagerWithTopolgyInfo = BlockManagerId( + val blockManagerWithTopologyInfo = BlockManagerId( store.blockManagerId.executorId, store.blockManagerId.host, store.blockManagerId.port, rack) - store.blockManagerId = blockManagerWithTopolgyInfo + store.blockManagerId = blockManagerWithTopologyInfo val locations = Seq( BlockManagerId("executor4", otherHost, externalShuffleServicePort, rack), BlockManagerId("executor3", otherHost, port, rack), diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index 4cd1fc19f1484..7640c17166222 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -506,9 +506,9 @@ class JsonProtocolSuite extends SparkFunSuite { val oldExecutorMetricsJson = JsonProtocol.executorMetricsToJson(executorMetrics) .removeField( _._1 == "MappedPoolMemory") - val exepectedExecutorMetrics = new ExecutorMetrics(Array(12L, 23L, 45L, 67L, + val expectedExecutorMetrics = new ExecutorMetrics(Array(12L, 23L, 45L, 67L, 78L, 89L, 90L, 123L, 456L, 0L, 40L, 20L, 20L, 10L, 20L, 10L)) - assertEquals(exepectedExecutorMetrics, + assertEquals(expectedExecutorMetrics, JsonProtocol.executorMetricsFromJson(oldExecutorMetricsJson)) } @@ -978,8 +978,8 @@ private[spark] object JsonProtocolSuite extends Assertions { private val stackTrace = { Array[StackTraceElement]( new StackTraceElement("Apollo", "Venus", "Mercury", 42), - new StackTraceElement("Afollo", "Vemus", "Mercurry", 420), - new StackTraceElement("Ayollo", "Vesus", "Blackberry", 4200) + new StackTraceElement("Afollo", "Vemus", "Mercurry", 420), /* odd spellings intentional */ + new StackTraceElement("Ayollo", "Vesus", "Blackberry", 4200) /* odd spellings intentional */ ) } diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala index 6183ba9faa6b4..d669f2c655abb 100644 --- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala @@ -94,7 +94,7 @@ class SizeEstimatorSuite override def beforeEach(): Unit = { super.beforeEach() // Set the arch to 64-bit and compressedOops to true so that SizeEstimator - // provides identical results accross all systems in these tests. + // provides identical results across all systems in these tests. 
reinitializeSizeEstimator("amd64", "true") } diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 6b4b1c652a81b..7d0e78738095e 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -66,10 +66,10 @@ def render(context) rendered_code + hint end - # Trim the code block so as to have the same indention, regardless of their positions in the + # Trim the code block so as to have the same indentation, regardless of their positions in the # code file. def trim_codeblock(lines) - # Select the minimum indention of the current code block. + # Select the minimum indentation of the current code block. min_start_spaces = lines .select { |l| l.strip.size !=0 } .map { |l| l[/\A */].size } diff --git a/docs/building-spark.md b/docs/building-spark.md index 73c527b7a5ed6..5106f2abd4187 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -273,7 +273,7 @@ Enable the profile (e.g. 2.13): # For sbt ./build/sbt -Pscala-2.13 compile -## Running Jenkins tests with Github Enterprise +## Running Jenkins tests with GitHub Enterprise To run tests with Jenkins: diff --git a/docs/configuration.md b/docs/configuration.md index 14ff38dac9b13..76494b04c9279 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2170,7 +2170,7 @@ Apart from these, the following properties are also available, and may be useful 120s The timeout in seconds to wait to acquire a new executor and schedule a task before aborting a - TaskSet which is unschedulable because all executors are exluded due to task failures. + TaskSet which is unschedulable because all executors are excluded due to task failures. 2.4.1 diff --git a/docs/css/main.css b/docs/css/main.css index 8b279a157c2b6..271113c904d26 100755 --- a/docs/css/main.css +++ b/docs/css/main.css @@ -254,7 +254,7 @@ a:hover code { position: relative; background-color: #FFF; max-width: 914px; - line-height: 1.6; /* Inspired by Github's wiki style */ + line-height: 1.6; /* Inspired by GitHub's wiki style */ padding-left: 15px; } @@ -263,7 +263,7 @@ a:hover code { position: relative; background-color: #FFF; max-width: 914px; - line-height: 1.6; /* Inspired by Github's wiki style */ + line-height: 1.6; /* Inspired by GitHub's wiki style */ padding-left: 30px; min-height: 100vh; } diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 50c9366a0999f..a1026669dc4fd 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -571,7 +571,7 @@ messages to the source and destination attributes. Think of `sendMsg` as the reduce function in map-reduce. -The [`aggregateMessages`][Graph.aggregateMessages] operator returns an `VertexRDD[Msg]` +The [`aggregateMessages`][Graph.aggregateMessages] operator returns a `VertexRDD[Msg]` containing the aggregate message (of type `Msg`) destined to each vertex. Vertices that did not receive a message are not included in the returned `VertexRDD`[VertexRDD]. @@ -874,7 +874,7 @@ change the `VertexId` thereby enabling the same `HashMap` data structures to be `HashMap` and implement the join by linear scan rather than costly point lookups. The `aggregateUsingIndex` operator is useful for efficient construction of a new `VertexRDD`[VertexRDD] from an -`RDD[(VertexId, A)]`. Conceptually, if I have constructed an `VertexRDD[B]` over a set of vertices, +`RDD[(VertexId, A)]`.
Conceptually, if I have constructed a `VertexRDD[B]` over a set of vertices, *which is a super-set* of the vertices in some `RDD[(VertexId, A)]` then I can reuse the index to both aggregate and then subsequently index the `RDD[(VertexId, A)]`. For example: diff --git a/docs/ml-migration-guide.md b/docs/ml-migration-guide.md index 4e6d68f5a8cf4..43b8de83a9d8c 100644 --- a/docs/ml-migration-guide.md +++ b/docs/ml-migration-guide.md @@ -281,7 +281,7 @@ Several deprecated methods were removed in the `spark.mllib` and `spark.ml` pack * `weights` in `LinearRegression` and `LogisticRegression` in `spark.ml` * `setMaxNumIterations` in `mllib.optimization.LBFGS` (marked as `DeveloperApi`) * `treeReduce` and `treeAggregate` in `mllib.rdd.RDDFunctions` (these functions are available on `RDD`s directly, and were marked as `DeveloperApi`) -* `defaultStategy` in `mllib.tree.configuration.Strategy` +* `defaultStrategy` in `mllib.tree.configuration.Strategy` * `build` in `mllib.tree.Node` * libsvm loaders for multiclass and load/save labeledData methods in `mllib.util.MLUtils` diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 4cb2e259ccfbc..cc0c0e39e66f8 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -189,7 +189,7 @@ Refer to the [`PowerIterationClustering` Scala docs](api/scala/org/apache/spark/ [`PowerIterationClustering`](api/java/org/apache/spark/mllib/clustering/PowerIterationClustering.html) implements the PIC algorithm. -It takes an `JavaRDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the +It takes a `JavaRDD` of `(srcId: Long, dstId: Long, similarity: Double)` tuples representing the affinity matrix. Calling `PowerIterationClustering.run` returns a [`PowerIterationClusteringModel`](api/java/org/apache/spark/mllib/clustering/PowerIterationClusteringModel.html) diff --git a/docs/mllib-data-types.md b/docs/mllib-data-types.md index 6d3b1a599d48b..ce4e6b8e05814 100644 --- a/docs/mllib-data-types.md +++ b/docs/mllib-data-types.md @@ -643,7 +643,7 @@ entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), Matrix # - or using (long, long, float) tuples: entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)]) -# Create an CoordinateMatrix from an RDD of MatrixEntries. +# Create a CoordinateMatrix from an RDD of MatrixEntries. mat = CoordinateMatrix(entries) # Get its size. diff --git a/docs/monitoring.md b/docs/monitoring.md index 15a6cbd910210..c6105188f07ec 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -421,7 +421,7 @@ to handle the Spark Context setup and tear down. In addition to viewing the metrics in the UI, they are also available as JSON. This gives developers an easy way to create new visualizations and monitoring tools for Spark. The JSON is available for -both running applications, and in the history server. The endpoints are mounted at `/api/v1`. Eg., +both running applications, and in the history server. The endpoints are mounted at `/api/v1`. For example, for the history server, they would typically be accessible at `http://:18080/api/v1`, and for a running application, at `http://localhost:4040/api/v1`. @@ -951,11 +951,11 @@ These endpoints have been strongly versioned to make it easier to develop applic * Individual fields will never be removed for any given endpoint * New endpoints may be added * New fields may be added to existing endpoints -* New versions of the api may be added in the future as a separate endpoint (eg., `api/v2`).
New versions are *not* required to be backwards compatible. +* New versions of the api may be added in the future as a separate endpoint (e.g., `api/v2`). New versions are *not* required to be backwards compatible. * Api versions may be dropped, but only after at least one minor release of co-existing with a new api version. Note that even when examining the UI of running applications, the `applications/[app-id]` portion is -still required, though there is only one application available. Eg. to see the list of jobs for the +still required, though there is only one application available. E.g. to see the list of jobs for the running app, you would go to `http://localhost:4040/api/v1/applications/[app-id]/jobs`. This is to keep the paths consistent in both modes. diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 5ec7a2c6f0bf4..71b7df8176d1b 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -1079,7 +1079,7 @@ See the [configuration page](configuration.html) for information on Spark config 0.1 This sets the Memory Overhead Factor that will allocate memory to non-JVM memory, which includes off-heap memory allocations, non-JVM tasks, and various systems processes. For JVM-based jobs this value will default to 0.10 and 0.40 for non-JVM jobs. - This is done as non-JVM tasks need more non-JVM heap space and such tasks commonly fail with "Memory Overhead Exceeded" errors. This prempts this error with a higher default. + This is done as non-JVM tasks need more non-JVM heap space and such tasks commonly fail with "Memory Overhead Exceeded" errors. This preempts this error with a higher default. 2.4.0 @@ -1402,4 +1402,4 @@ Kubernetes does not tell Spark the addresses of the resources allocated to each ### Stage Level Scheduling Overview Stage level scheduling is supported on Kubernetes when dynamic allocation is enabled. This also requires spark.dynamicAllocation.shuffleTracking.enabled to be enabled since Kubernetes doesn't support an external shuffle service at this time. The order in which containers for different profiles is requested from Kubernetes is not guaranteed. Note that since dynamic allocation on Kubernetes requires the shuffle tracking feature, this means that executors from previous stages that used a different ResourceProfile may not idle timeout due to having shuffle data on them. This could result in using more cluster resources and in the worst case if there are no remaining resources on the Kubernetes cluster then Spark could potentially hang. You may consider looking at config spark.dynamicAllocation.shuffleTracking.timeout to set a timeout, but that could result in data having to be recomputed if the shuffle data is really needed. -Note, there is a difference in the way pod template resources are handled between the base default profile and custom ResourceProfiles. Any resources specified in the pod template file will only be used with the base default profile. If you create custom ResourceProfiles be sure to include all necessary resources there since the resources from the template file will not be propogated to custom ResourceProfiles. +Note, there is a difference in the way pod template resources are handled between the base default profile and custom ResourceProfiles. Any resources specified in the pod template file will only be used with the base default profile. 
If you create custom ResourceProfiles be sure to include all necessary resources there since the resources from the template file will not be propagated to custom ResourceProfiles. diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 80591bd08650a..8c0bac1815bbd 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -857,7 +857,7 @@ See the [configuration page](configuration.html) for information on Spark config host Provides support for the `local:///` scheme to reference the app jar resource in cluster mode. - If user uses a local resource (`local:///path/to/jar`) and the config option is not used it defaults to `host` eg. + If user uses a local resource (`local:///path/to/jar`) and the config option is not used it defaults to `host` e.g. the mesos fetcher tries to get the resource from the host's file system. If the value is unknown it prints a warning msg in the dispatcher logs and defaults to `host`. If the value is `container` then spark submit in the container will use the jar in the container's path: diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 73c4930dadbd5..797d18a0d4139 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -644,7 +644,7 @@ YARN does not tell Spark the addresses of the resources allocated to each contai # Stage Level Scheduling Overview Stage level scheduling is supported on YARN when dynamic allocation is enabled. One thing to note that is YARN specific is that each ResourceProfile requires a different container priority on YARN. The mapping is simply the ResourceProfile id becomes the priority, on YARN lower numbers are higher priority. This means that profiles created earlier will have a higher priority in YARN. Normally this won't matter as Spark finishes one stage before starting another one, the only case this might have an affect is in a job server type scenario, so its something to keep in mind. -Note there is a difference in the way custom resources are handled between the base default profile and custom ResourceProfiles. To allow for the user to request YARN containers with extra resources without Spark scheduling on them, the user can specify resources via the spark.yarn.executor.resource. config. Those configs are only used in the base default profile though and do not get propogated into any other custom ResourceProfiles. This is because there would be no way to remove them if you wanted a stage to not have them. This results in your default profile getting custom resources defined in spark.yarn.executor.resource. plus spark defined resources of GPU or FPGA. Spark converts GPU and FPGA resources into the YARN built in types yarn.io/gpu) and yarn.io/fpga, but does not know the mapping of any other resources. Any other Spark custom resources are not propogated to YARN for the default profile. So if you want Spark to schedule based off a custom resource and have it requested from YARN, you must specify it in both YARN (spark.yarn.{driver/executor}.resource.) and Spark (spark.{driver/executor}.resource.) configs. Leave the Spark config off if you only want YARN containers with the extra resources but Spark not to schedule using them. Now for custom ResourceProfiles, it doesn't currently have a way to only specify YARN resources without Spark scheduling off of them. This means for custom ResourceProfiles we propogate all the resources defined in the ResourceProfile to YARN. We still convert GPU and FPGA to the YARN build in types as well. 
This requires that the name of any custom resources you specify match what they are defined as in YARN. +Note there is a difference in the way custom resources are handled between the base default profile and custom ResourceProfiles. To allow for the user to request YARN containers with extra resources without Spark scheduling on them, the user can specify resources via the spark.yarn.executor.resource. config. Those configs are only used in the base default profile though and do not get propagated into any other custom ResourceProfiles. This is because there would be no way to remove them if you wanted a stage to not have them. This results in your default profile getting custom resources defined in spark.yarn.executor.resource. plus spark defined resources of GPU or FPGA. Spark converts GPU and FPGA resources into the YARN built in types yarn.io/gpu) and yarn.io/fpga, but does not know the mapping of any other resources. Any other Spark custom resources are not propagated to YARN for the default profile. So if you want Spark to schedule based off a custom resource and have it requested from YARN, you must specify it in both YARN (spark.yarn.{driver/executor}.resource.) and Spark (spark.{driver/executor}.resource.) configs. Leave the Spark config off if you only want YARN containers with the extra resources but Spark not to schedule using them. Now for custom ResourceProfiles, it doesn't currently have a way to only specify YARN resources without Spark scheduling off of them. This means for custom ResourceProfiles we propagate all the resources defined in the ResourceProfile to YARN. We still convert GPU and FPGA to the YARN build in types as well. This requires that the name of any custom resources you specify match what they are defined as in YARN. # Important notes diff --git a/docs/sparkr.md b/docs/sparkr.md index 05310f89f278d..002da5a56fa9e 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -671,7 +671,7 @@ Arrow R library is available on CRAN and it can be installed as below. ```bash Rscript -e 'install.packages("arrow", repos="https://cloud.r-project.org/")' ``` -Please refer [the official documentation of Apache Arrow](https://arrow.apache.org/docs/r/) for more detials. +Please refer [the official documentation of Apache Arrow](https://arrow.apache.org/docs/r/) for more details. Note that you must ensure that Arrow R package is installed and available on all cluster nodes. The current supported minimum version is 1.0.0; however, this might change between the minor releases since Arrow optimization in SparkR is experimental. diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md index b95be0974585e..7d60915e2a65e 100644 --- a/docs/sql-data-sources-jdbc.md +++ b/docs/sql-data-sources-jdbc.md @@ -131,7 +131,7 @@ the following case-insensitive options: fetchsize - The JDBC fetch size, which determines how many rows to fetch per round trip. This can help performance on JDBC drivers which default to low fetch size (eg. Oracle with 10 rows). This option applies only to reading. + The JDBC fetch size, which determines how many rows to fetch per round trip. This can help performance on JDBC drivers which default to low fetch size (e.g. Oracle with 10 rows). This option applies only to reading. 
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 7997090e710a9..2c86e7a932637 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -333,7 +333,7 @@ license: | - - Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. Eg. if `a` is a `struct(a string, b int)`, in Spark 2.4 `a in (select (1 as a, 'a' as b) from range(1))` is a valid query, while `a in (select 1, 'a' from range(1))` is not. In previous version it was the opposite. + - Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. For example, if `a` is a `struct(a string, b int)`, in Spark 2.4 `a in (select (1 as a, 'a' as b) from range(1))` is a valid query, while `a in (select 1, 'a' from range(1))` is not. In previous version it was the opposite. - In versions 2.2.1+ and 2.3, if `spark.sql.caseSensitive` is set to true, then the `CURRENT_DATE` and `CURRENT_TIMESTAMP` functions incorrectly became case-sensitive and would resolve to columns (unless typed in lower case). In Spark 2.4 this has been fixed and the functions are no longer case-sensitive. @@ -532,11 +532,11 @@ license: | - Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant with SQL ANSI 2011 specification and Hive's new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes - - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, ie. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive module (`pmod`). + - The rules to determine the result type of an arithmetic operation have been updated. In particular, if the precision / scale needed are out of the range of available values, the scale is reduced up to 6, in order to prevent the truncation of the integer part of the decimals. All the arithmetic operations are affected by the change, i.e. addition (`+`), subtraction (`-`), multiplication (`*`), division (`/`), remainder (`%`) and positive modulus (`pmod`). - Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them. - - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, ie. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. + - The configuration `spark.sql.decimalOperations.allowPrecisionLoss` has been introduced. It defaults to `true`, which means the new behavior described here; if set to `false`, Spark uses previous rules, i.e. it doesn't adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible. 
- Un-aliased subquery's semantic has not been well defined with confusing behaviors. Since Spark 2.3, we invalidate such confusing cases, for example: `SELECT v.i from (SELECT i FROM v)`, Spark will throw an analysis exception in this case because users should not be able to use the qualifier inside a subquery. See [SPARK-20690](https://issues.apache.org/jira/browse/SPARK-20690) and [SPARK-21335](https://issues.apache.org/jira/browse/SPARK-21335) for more details. diff --git a/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md b/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md index 47dd2be77ae90..ada86d8dd3913 100644 --- a/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md +++ b/docs/sql-ref-syntax-aux-conf-mgmt-set-timezone.md @@ -43,7 +43,7 @@ SET TIME ZONE INTERVAL interval_literal * **interval_literal** - The [interval literal](sql-ref-literals.html#interval-literal) represents the difference between the session time zone to the 'UTC'. It must be in the range of [-18, 18] hours and max to second precision, e.g. `INTERVAL 2 HOURS 30 MINITUES` or `INTERVAL '15:40:32' HOUR TO SECOND`. + The [interval literal](sql-ref-literals.html#interval-literal) represents the difference between the session time zone to the 'UTC'. It must be in the range of [-18, 18] hours and max to second precision, e.g. `INTERVAL 2 HOURS 30 MINUTES` or `INTERVAL '15:40:32' HOUR TO SECOND`. ### Examples diff --git a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md index 3a8c8d5b1160a..11ec2f1d9ea85 100644 --- a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md +++ b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md @@ -42,10 +42,10 @@ CREATE [ EXTERNAL ] TABLE [ IF NOT EXISTS ] table_identifier row_format: : SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ] - | DELIMITED [ FIELDS TERMINATED BY fields_termiated_char [ ESCAPED BY escaped_char ] ] - [ COLLECTION ITEMS TERMINATED BY collection_items_termiated_char ] - [ MAP KEYS TERMINATED BY map_key_termiated_char ] - [ LINES TERMINATED BY row_termiated_char ] + | DELIMITED [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ] + [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ] + [ MAP KEYS TERMINATED BY map_key_terminated_char ] + [ LINES TERMINATED BY row_terminated_char ] [ NULL DEFINED AS null_char ] ``` diff --git a/docs/sql-ref-syntax-dml-insert-into.md b/docs/sql-ref-syntax-dml-insert-into.md index ed5da2b2d28df..39d15808d033e 100644 --- a/docs/sql-ref-syntax-dml-insert-into.md +++ b/docs/sql-ref-syntax-dml-insert-into.md @@ -69,11 +69,11 @@ INSERT INTO students VALUES ('Amy Smith', '123 Park Ave, San Jose', 111111); SELECT * FROM students; -+---------+---------------------+----------+ -| name| address|student_id| -+---------+---------------------+----------+ -|Amy Smith|123 Park Ave,San Jose| 111111| -+---------+---------------------+----------+ ++---------+----------------------+----------+ +| name| address|student_id| ++---------+----------------------+----------+ +|Amy Smith|123 Park Ave, San Jose| 111111| ++---------+----------------------+----------+ ``` #### Multi-Row Insert Using a VALUES Clause @@ -100,29 +100,29 @@ SELECT * FROM students; ```sql -- Assuming the persons table has already been created and populated. 
SELECT * FROM persons; -+-------------+-------------------------+---------+ -| name| address| ssn| -+-------------+-------------------------+---------+ -|Dora Williams|134 Forest Ave, Melo Park|123456789| -+-------------+-------------------------+---------+ -| Eddie Davis| 245 Market St, Milpitas|345678901| -+-------------+-------------------------+---------+ ++-------------+--------------------------+---------+ +| name| address| ssn| ++-------------+--------------------------+---------+ +|Dora Williams|134 Forest Ave, Menlo Park|123456789| ++-------------+--------------------------+---------+ +| Eddie Davis| 245 Market St, Milpitas|345678901| ++-------------+--------------------------+---------+ INSERT INTO students PARTITION (student_id = 444444) SELECT name, address FROM persons WHERE name = "Dora Williams"; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave, San Jose| 111111| -+-------------+-------------------------+----------+ -| Bob Brown| 456 Taylor St, Cupertino| 222222| -+-------------+-------------------------+----------+ -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 444444| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| ++-------------+--------------------------+----------+ +| Bob Brown| 456 Taylor St, Cupertino| 222222| ++-------------+--------------------------+----------+ +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 444444| ++-------------+--------------------------+----------+ ``` #### Insert Using a TABLE Statement @@ -141,21 +141,21 @@ SELECT * FROM visiting_students; INSERT INTO students TABLE visiting_students; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave,San Jose| 111111| -+-------------+-------------------------+----------+ -| Bob Brown| 456 Taylor St, Cupertino| 222222| -+-------------+-------------------------+----------+ -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 444444| -+-------------+-------------------------+----------+ -|Fleur Laurent| 345 Copper St, London| 777777| -+-------------+-------------------------+----------+ -|Gordon Martin| 779 Lake Ave, Oxford| 888888| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| ++-------------+--------------------------+----------+ +| Bob Brown| 456 Taylor St, Cupertino| 222222| ++-------------+--------------------------+----------+ +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 444444| ++-------------+--------------------------+----------+ +|Fleur Laurent| 345 Copper St, London| 777777| ++-------------+--------------------------+----------+ +|Gordon Martin| 779 Lake Ave, 
Oxford| 888888| ++-------------+--------------------------+----------+ ``` #### Insert Using a FROM Statement @@ -177,25 +177,25 @@ INSERT INTO students FROM applicants SELECT name, address, id applicants WHERE qualified = true; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave, San Jose| 111111| -+-------------+-------------------------+----------+ -| Bob Brown| 456 Taylor St, Cupertino| 222222| -+-------------+-------------------------+----------+ -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 444444| -+-------------+-------------------------+----------+ -|Fleur Laurent| 345 Copper St, London| 777777| -+-------------+-------------------------+----------+ -|Gordon Martin| 779 Lake Ave, Oxford| 888888| -+-------------+-------------------------+----------+ -| Helen Davis|469 Mission St, San Diego| 999999| -+-------------+-------------------------+----------+ -| Jason Wang| 908 Bird St, Saratoga| 121212| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| ++-------------+--------------------------+----------+ +| Bob Brown| 456 Taylor St, Cupertino| 222222| ++-------------+--------------------------+----------+ +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 444444| ++-------------+--------------------------+----------+ +|Fleur Laurent| 345 Copper St, London| 777777| ++-------------+--------------------------+----------+ +|Gordon Martin| 779 Lake Ave, Oxford| 888888| ++-------------+--------------------------+----------+ +| Helen Davis| 469 Mission St, San Diego| 999999| ++-------------+--------------------------+----------+ +| Jason Wang| 908 Bird St, Saratoga| 121212| ++-------------+--------------------------+----------+ ``` ### Related Statements diff --git a/docs/sql-ref-syntax-dml-insert-overwrite-table.md b/docs/sql-ref-syntax-dml-insert-overwrite-table.md index ecfd060dfd5ee..638dcb34bb1d2 100644 --- a/docs/sql-ref-syntax-dml-insert-overwrite-table.md +++ b/docs/sql-ref-syntax-dml-insert-overwrite-table.md @@ -64,18 +64,18 @@ INSERT OVERWRITE [ TABLE ] table_identifier [ partition_spec [ IF NOT EXISTS ] ] ```sql -- Assuming the students table has already been created and populated. 
SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Amy Smith| 123 Park Ave, San Jose| 111111| -| Bob Brown| 456 Taylor St, Cupertino| 222222| -|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| -|Dora Williams|134 Forest Ave, Melo Park| 444444| -|Fleur Laurent| 345 Copper St, London| 777777| -|Gordon Martin| 779 Lake Ave, Oxford| 888888| -| Helen Davis|469 Mission St, San Diego| 999999| -| Jason Wang| 908 Bird St, Saratoga| 121212| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Amy Smith| 123 Park Ave, San Jose| 111111| +| Bob Brown| 456 Taylor St, Cupertino| 222222| +|Cathy Johnson| 789 Race Ave, Palo Alto| 333333| +|Dora Williams|134 Forest Ave, Menlo Park| 444444| +|Fleur Laurent| 345 Copper St, London| 777777| +|Gordon Martin| 779 Lake Ave, Oxford| 888888| +| Helen Davis| 469 Mission St, San Diego| 999999| +| Jason Wang| 908 Bird St, Saratoga| 121212| ++-------------+--------------------------+----------+ INSERT OVERWRITE students VALUES ('Ashua Hill', '456 Erica Ct, Cupertino', 111111), @@ -95,25 +95,25 @@ SELECT * FROM students; ```sql -- Assuming the persons table has already been created and populated. SELECT * FROM persons; -+-------------+-------------------------+---------+ -| name| address| ssn| -+-------------+-------------------------+---------+ -|Dora Williams|134 Forest Ave, Melo Park|123456789| -+-------------+-------------------------+---------+ -| Eddie Davis| 245 Market St,Milpitas|345678901| -+-------------+-------------------------+---------+ ++-------------+--------------------------+---------+ +| name| address| ssn| ++-------------+--------------------------+---------+ +|Dora Williams|134 Forest Ave, Menlo Park|123456789| ++-------------+--------------------------+---------+ +| Eddie Davis| 245 Market St, Milpitas|345678901| ++-------------+--------------------------+---------+ INSERT OVERWRITE students PARTITION (student_id = 222222) SELECT name, address FROM persons WHERE name = "Dora Williams"; SELECT * FROM students; -+-------------+-------------------------+----------+ -| name| address|student_id| -+-------------+-------------------------+----------+ -| Ashua Hill| 456 Erica Ct, Cupertino| 111111| -+-------------+-------------------------+----------+ -|Dora Williams|134 Forest Ave, Melo Park| 222222| -+-------------+-------------------------+----------+ ++-------------+--------------------------+----------+ +| name| address|student_id| ++-------------+--------------------------+----------+ +| Ashua Hill| 456 Erica Ct, Cupertino| 111111| ++-------------+--------------------------+----------+ +|Dora Williams|134 Forest Ave, Menlo Park| 222222| ++-------------+--------------------------+----------+ ``` #### Insert Using a TABLE Statement diff --git a/docs/sql-ref-syntax-qry-select-groupby.md b/docs/sql-ref-syntax-qry-select-groupby.md index 934e5f70d4b08..ef9de1f594a31 100644 --- a/docs/sql-ref-syntax-qry-select-groupby.md +++ b/docs/sql-ref-syntax-qry-select-groupby.md @@ -269,7 +269,7 @@ INSERT INTO person VALUES (300, 'Mike', 80), (400, 'Dan', 50); ---Select the first row in cloumn age +--Select the first row in column age SELECT FIRST(age) FROM person; +--------------------+ | first(age, false) | @@ -277,7 +277,7 @@ SELECT FIRST(age) FROM person; | NULL | +--------------------+ ---Get the first row in 
cloumn `age` ignore nulls,last row in column `id` and sum of cloumn `id`. +--Get the first row in column `age` ignore nulls,last row in column `id` and sum of column `id`. SELECT FIRST(age IGNORE NULLS), LAST(id), SUM(id) FROM person; +-------------------+------------------+----------+ | first(age, true) | last(id, false) | sum(id) | diff --git a/docs/sql-ref-syntax-qry-select-lateral-view.md b/docs/sql-ref-syntax-qry-select-lateral-view.md index f742c8fa57043..c854625a1a959 100644 --- a/docs/sql-ref-syntax-qry-select-lateral-view.md +++ b/docs/sql-ref-syntax-qry-select-lateral-view.md @@ -58,7 +58,7 @@ INSERT INTO person VALUES (400, 'Dan', 50, 4, 'Street 4'); SELECT * FROM person - LATERAL VIEW EXPLODE(ARRAY(30, 60)) tabelName AS c_age + LATERAL VIEW EXPLODE(ARRAY(30, 60)) tableName AS c_age LATERAL VIEW EXPLODE(ARRAY(40, 80)) AS d_age; +------+-------+-------+--------+-----------+--------+--------+ | id | name | age | class | address | c_age | d_age | @@ -93,14 +93,14 @@ GROUP BY c_age; +--------+-----------+ SELECT * FROM person - LATERAL VIEW EXPLODE(ARRAY()) tabelName AS c_age; + LATERAL VIEW EXPLODE(ARRAY()) tableName AS c_age; +-----+-------+------+--------+----------+--------+ | id | name | age | class | address | c_age | +-----+-------+------+--------+----------+--------+ +-----+-------+------+--------+----------+--------+ SELECT * FROM person - LATERAL VIEW OUTER EXPLODE(ARRAY()) tabelName AS c_age; + LATERAL VIEW OUTER EXPLODE(ARRAY()) tableName AS c_age; +------+-------+-------+--------+-----------+--------+ | id | name | age | class | address | c_age | +------+-------+-------+--------+-----------+--------+ diff --git a/docs/sql-ref-syntax-qry-select-orderby.md b/docs/sql-ref-syntax-qry-select-orderby.md index 13f0ae40cb828..552ee9be66d1e 100644 --- a/docs/sql-ref-syntax-qry-select-orderby.md +++ b/docs/sql-ref-syntax-qry-select-orderby.md @@ -28,7 +28,7 @@ clause, this clause guarantees a total order in the output. ### Syntax ```sql -ORDER BY { expression [ sort_direction | nulls_sort_oder ] [ , ... ] } +ORDER BY { expression [ sort_direction | nulls_sort_order ] [ , ... ] } ``` ### Parameters diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala index a785d063f1476..3dea244c77226 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/ClusteringMetrics.scala @@ -127,7 +127,7 @@ private[evaluation] abstract class Silhouette { * `$a_{i}$` can be interpreted as how well `i` is assigned to its cluster * (the smaller the value, the better the assignment), while `$b_{i}$` is * a measure of how well `i` has not been assigned to its "neighboring cluster", - * ie. the nearest cluster to `i`. + * i.e. the nearest cluster to `i`. * * Unfortunately, the naive implementation of the algorithm requires to compute * the distance of each couple of points in the dataset. Since the computation of @@ -486,7 +486,7 @@ private[evaluation] object CosineSilhouette extends Silhouette { * for the point. * @param weightCol The name of the column which contains the instance weight. * @return A [[scala.collection.immutable.Map]] which associates each cluster id to a - * its statistics (ie. the precomputed values `N` and `$\Omega_{\Gamma}$`). + * its statistics (i.e. the precomputed values `N` and `$\Omega_{\Gamma}$`). 
*/ def computeClusterStats( df: DataFrame, diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 5ed7619fce5dc..2ec7a8632e39d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -112,7 +112,7 @@ final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) (Seq($(inputCol)), Seq($(outputCol)), Seq($(threshold))) } - val ouputCols = inputColNames.zip(tds).map { case (inputColName, td) => + val mappedOutputCols = inputColNames.zip(tds).map { case (inputColName, td) => val binarizerUDF = dataset.schema(inputColName).dataType match { case DoubleType => udf { in: Double => if (in > td) 1.0 else 0.0 } @@ -147,8 +147,8 @@ final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) binarizerUDF(col(inputColName)) } - val ouputMetadata = outputColNames.map(outputSchema(_).metadata) - dataset.withColumns(outputColNames, ouputCols, ouputMetadata) + val outputMetadata = outputColNames.map(outputSchema(_).metadata) + dataset.withColumns(outputColNames, mappedOutputCols, outputMetadata) } @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala index 46052a89fdf1a..41de26dff03ab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Selector.scala @@ -77,7 +77,7 @@ private[feature] trait SelectorParams extends Params * @group param */ @Since("3.1.0") - final val fpr = new DoubleParam(this, "fpr", "The higest p-value for features to be kept.", + final val fpr = new DoubleParam(this, "fpr", "The highest p-value for features to be kept.", ParamValidators.inRange(0, 1)) /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index b6ed4f2b000cc..8bcd7909b6078 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -166,11 +166,11 @@ class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String } val (inputColNames, outputColNames) = getInOutCols() - val ouputCols = inputColNames.map { inputColName => + val outputCols = inputColNames.map { inputColName => t(col(inputColName)) } - val ouputMetadata = outputColNames.map(outputSchema(_).metadata) - dataset.withColumns(outputColNames, ouputCols, ouputMetadata) + val outputMetadata = outputColNames.map(outputSchema(_).metadata) + dataset.withColumns(outputColNames, outputCols, outputMetadata) } @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala index 5efcf0dce68a2..37b715930a501 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/image/ImageSchema.scala @@ -133,7 +133,7 @@ object ImageSchema { val img = try { ImageIO.read(new ByteArrayInputStream(bytes)) } catch { - // Catch runtime exception because `ImageIO` may throw unexcepted `RuntimeException`. + // Catch runtime exception because `ImageIO` may throw unexpected `RuntimeException`. 
// But do not catch the declared `IOException` (regarded as FileSystem failure) case _: RuntimeException => null } diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala index 1b5f77a9ae897..594d9f315f508 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala @@ -88,9 +88,9 @@ private[r] object AFTSurvivalRegressionWrapper extends MLReadable[AFTSurvivalReg aggregationDepth: Int, stringIndexerOrderType: String): AFTSurvivalRegressionWrapper = { - val (rewritedFormula, censorCol) = formulaRewrite(formula) + val (rewrittenFormula, censorCol) = formulaRewrite(formula) - val rFormula = new RFormula().setFormula(rewritedFormula) + val rFormula = new RFormula().setFormula(rewrittenFormula) .setStringIndexerOrderType(stringIndexerOrderType) RWrapperUtils.checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala index 84c0985245a2e..f70baa4ddd393 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala @@ -555,7 +555,7 @@ object FMRegressionModel extends MLReadable[FMRegressionModel] { * \hat{y} = p\left( y_{fm} \right) * }}} * p is the prediction function, for binary classification task is sigmoid. - * The loss funcation gradient formula: + * The loss function gradient formula: * {{{ * \frac{\partial}{\partial\theta} l\left( \hat{y},y \right) = * \frac{\partial}{\partial\theta} l\left( p\left( y_{fm} \right),y \right) = diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 087c2c2639831..90cc4fb13b995 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -146,7 +146,7 @@ class SVMWithSGD private ( /** * Construct a SVM object with default parameters: {stepSize: 1.0, numIterations: 100, - * regParm: 0.01, miniBatchFraction: 1.0}. + * regParam: 0.01, miniBatchFraction: 1.0}. */ @Since("0.8.0") def this() = this(1.0, 100, 0.01, 1.0) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala index bffed61c291ea..9ac473aabecea 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/DistanceMeasure.scala @@ -41,7 +41,7 @@ private[spark] abstract class DistanceMeasure extends Serializable { * 1, if i != j: a bound r = matrix(i,j) to help avoiding unnecessary distance * computation. Given point x, let i be current closest center, and d be current best * distance, if d < f(r), then we no longer need to compute the distance to center j; - * 2, if i == j: a bound r = matrix(i,i) = min_k{maxtrix(i,k)|k!=i}. If distance + * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If distance * between point x and center i is less than f(r), then center i is the closest center * to point x. 
*/ @@ -268,7 +268,7 @@ private[spark] class EuclideanDistanceMeasure extends DistanceMeasure { * squared distance, if d < r, then we no longer need to compute the distance to center * j. matrix(i,j) equals to squared of half of Euclidean distance between centers i * and j; - * 2, if i == j: a bound r = matrix(i,i) = min_k{maxtrix(i,k)|k!=i}. If squared + * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If squared * distance between point x and center i is less than r, then center i is the closest * center to point x. */ @@ -405,7 +405,7 @@ private[spark] class CosineDistanceMeasure extends DistanceMeasure { * is used instead of Cosine distance to compute matrix(i,j): for centers i and j, * compute the radian/angle between them, halving it, and converting it back to Cosine * distance at the end; - * 2, if i == j: a bound r = matrix(i,i) = min_k{maxtrix(i,k)|k!=i}. If Cosine + * 2, if i == j: a bound r = matrix(i,i) = min_k{matrix(i,k)|k!=i}. If Cosine * distance between point x and center i is less than r, then center i is the closest * center to point x. */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index b2742ee6ecb5b..c9f6d789d6740 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -466,7 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer with Logging { val seed = randomGenerator.nextLong() // If and only if optimizeDocConcentration is set true, // we calculate logphat in the same pass as other statistics. - // No calculation of loghat happens otherwise. + // No calculation of logphat happens otherwise. val logphatPartOptionBase = () => if (optimizeDocConcentration) { Some(BDV.zeros[Double](k)) } else { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index 3c9b806d616fc..111030dada491 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -36,7 +36,7 @@ import org.apache.spark.util.random.XORShiftRandom * doing a single iteration of the standard k-means algorithm. * * The update algorithm uses the "mini-batch" KMeans rule, - * generalized to incorporate forgetfullness (i.e. decay). + * generalized to incorporate forgetfulness (i.e. decay). * The update rule (for each cluster) is: * *
      diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala index c165d4810c934..f7c6d09f5e437 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala @@ -48,11 +48,11 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) { val mat = if (numFeatures > 65535) { val summary = Statistics.colStats(sources.map((_, 1.0)), Seq("mean")) val mean = Vectors.fromML(summary.mean) - val meanCentredRdd = sources.map { row => + val meanCenteredRdd = sources.map { row => BLAS.axpy(-1, mean, row) row } - new RowMatrix(meanCentredRdd) + new RowMatrix(meanCenteredRdd) } else { require(PCAUtil.memoryCost(k, numFeatures) < Int.MaxValue, "The param k and numFeatures is too large for SVD computation. " + diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 13899fa8296f6..eeb583f84ca8b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -560,7 +560,7 @@ class Word2VecModel private[spark] ( /** * Find synonyms of the vector representation of a word, possibly - * including any words in the model vocabulary whose vector respresentation + * including any words in the model vocabulary whose vector representation * is the supplied vector. * @param vector vector representation of a word * @param num number of synonyms to find diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 601c7da30ffed..606e2f2f212ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -88,8 +88,8 @@ class AssociationRules private[fpm] ( // Join to get (X, ((Y, freq(X union Y)), freq(X))), generate rules, and filter by confidence candidates.join(freqItemsets.map(x => (x.items.toSeq, x.freq))) - .map { case (antecendent, ((consequent, freqUnion), freqAntecedent)) => - new Rule(antecendent.toArray, + .map { case (antecedent, ((consequent, freqUnion), freqAntecedent)) => + new Rule(antecedent.toArray, consequent.toArray, freqUnion, freqAntecedent, diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index c618b71ddc5a8..d546f0c1a8e19 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -693,11 +693,11 @@ class RowMatrix @Since("1.0.0") ( val pBV = sc.broadcast(colMagsCorrected.map(c => sg / c)) val qBV = sc.broadcast(colMagsCorrected.map(c => math.min(sg, c))) - val sims = rows.mapPartitionsWithIndex { (indx, iter) => + val sims = rows.mapPartitionsWithIndex { (index, iter) => val p = pBV.value val q = qBV.value - val rand = new XORShiftRandom(indx) + val rand = new XORShiftRandom(index) val scaled = new Array[Double](p.size) iter.flatMap { row => row match { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala index d17f7047c5b2b..778de30e756c0 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala @@ -91,7 +91,7 @@ private[stat] object KolmogorovSmirnovTest extends Logging { * @param partData `Iterator[Double]` 1 partition of a sorted RDD * @param n `Double` the total size of the RDD * @param cdf `Double => Double` a function the calculates the theoretical CDF of a value - * @return `Iterator[(Double, Double)] `Unadjusted (ie. off by a constant) potential extrema + * @return `Iterator[(Double, Double)] `Unadjusted (i.e. off by a constant) potential extrema * in a partition. The first element corresponds to the (empirical CDF - 1/N) - CDF, * the second element corresponds to empirical CDF - CDF. We can then search the resulting * iterator for the minimum of the first and the maximum of the second element, and provide diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java index 6480b57e1f796..af32e03854b53 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaStopWordsRemoverSuite.java @@ -41,7 +41,7 @@ public void javaCompatibilityTest() { .setOutputCol("filtered"); List data = Arrays.asList( - RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")), + RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")), RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb")) ); StructType schema = new StructType(new StructField[]{ diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index b35f964c959bf..0eae23df8358d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -181,7 +181,7 @@ class GaussianMixtureSuite extends MLTest with DefaultReadWriteTest { } } - test("multivariate data and check againt R mvnormalmixEM") { + test("multivariate data and check against R mvnormalmixEM") { /* Using the following R code to generate data and train the model using mixtools package. library(mvtnorm) diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala index 5ee161ce8dd33..deaad2bd54d0e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala @@ -36,7 +36,7 @@ class RegressionEvaluatorSuite test("Regression Evaluator: default params") { /** * Here is the instruction describing how to export the test data into CSV format - * so we can validate the metrics compared with R's mmetric package. + * so we can validate the metrics compared with R's mmetric function. 
* * import org.apache.spark.mllib.util.LinearDataGenerator * val data = sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala index 1e1ab206cc1c2..0d664e421da4c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ANOVASelectorSuite.scala @@ -133,35 +133,35 @@ class ANOVASelectorSuite extends MLTest with DefaultReadWriteTest { ParamsSuite.checkParams(new ANOVASelector()) } - test("Test ANOVAFValue calssification selector: numTopFeatures") { + test("Test ANOVAFValue classification selector: numTopFeatures") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(1) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: percentile") { + test("Test ANOVAFValue classification selector: percentile") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.17) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: fpr") { + test("Test ANOVAFValue classification selector: fpr") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("fpr").setFpr(1.0E-12) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: fdr") { + test("Test ANOVAFValue classification selector: fdr") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("fdr").setFdr(6.0E-12) val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) } - test("Test ANOVAFValue calssification selector: fwe") { + test("Test ANOVAFValue classification selector: fwe") { val selector = new ANOVASelector() .setOutputCol("filtered").setSelectorType("fwe").setFwe(6.0E-12) val model = testSelector(selector, dataset) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala index 19645b517d79c..8f8365a59082b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/DCTSuite.scala @@ -81,7 +81,7 @@ class DCTSuite extends MLTest with DefaultReadWriteTest { .map { case Row(vec: Vector) => vec.size } .head() - // Can not infer size of ouput vector, since no metadata is provided + // Can not infer size of output vector, since no metadata is provided intercept[TestFailedException] { val transformed = transformer.transform(dataset) checkVectorSizeOnDF(transformed, "resultVec", vectorSize) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala index 93564681994d7..55dade28920ed 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.DataTypes private[ml] object LSHTest { /** - * For any locality sensitive function h in a metric space, we meed to verify whether + * For any locality sensitive function h in a metric space, we need to verify whether * the following property is 
satisfied. * * There exist dist1, dist2, p1, p2, so that for any two elements e1 and e2, diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala index cc451c0b60379..142abf2ccdfb9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VarianceThresholdSelectorSuite.scala @@ -53,7 +53,7 @@ class VarianceThresholdSelectorSuite extends MLTest with DefaultReadWriteTest { ParamsSuite.checkParams(new VarianceThresholdSelector) } - test("Test VarianceThresholdSelector: varainceThreshold not set") { + test("Test VarianceThresholdSelector: varianceThreshold not set") { val selector = new VarianceThresholdSelector().setOutputCol("filtered") val model = testSelector(selector, dataset) MLTestingUtils.checkCopyAndUids(selector, model) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a0e17a4b40fd2..bfa9f4b59511c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -494,7 +494,7 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest [1] -0.0457441 -0.6833928 [1] 1.8121235 -0.1747493 -0.5815417 - R code for deivance calculation: + R code for deviance calculation: data = cbind(y=c(0,1,0,0,0,1), x1=c(18, 12, 15, 13, 15, 16), x2=c(1,0,0,2,1,1)) summary(glm(y~x1+x2, family=poisson, data=data.frame(data)))$deviance [1] 3.70055 @@ -1661,7 +1661,7 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest } test("evaluate with labels that are not doubles") { - // Evaulate with a dataset that contains Labels not as doubles to verify correct casting + // Evaluate with a dataset that contains Labels not as doubles to verify correct casting val dataset = Seq( Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(19.0, 1.0, Vectors.dense(1.0, 7.0)), diff --git a/pom.xml b/pom.xml index cd7e1767d6b18..f0ad9b0167c32 100644 --- a/pom.xml +++ b/pom.xml @@ -229,7 +229,7 @@ declared in the projects that build assemblies. For other projects the scope should remain as "compile", otherwise they are not available - during compilation if the dependency is transivite (e.g. "graphx/" depending on "core/" and + during compilation if the dependency is transitive (e.g. "graphx/" depending on "core/" and needing Hadoop classes in the classpath to compile). 
--> compile @@ -1758,7 +1758,7 @@ ${hive.deps.scope} - + ${hive.group} hive-metastore diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index f696e93e9cef2..386de19e919e6 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -113,10 +113,9 @@ class ExecutorClassLoaderSuite val classLoader = new ExecutorClassLoader( new SparkConf(), null, url1, parentLoader, true) - // load 'scala.Option', using ClassforName to do the exact same behavior as - // what JavaDeserializationStream does - // scalastyle:off classforname + // load 'scala.Option', using Class.forName to do the exact same behavior as + // what JavaDeserializationStream does val optionClass = Class.forName("scala.Option", false, classLoader) // scalastyle:on classforname From 2da72593c1cf63fc6f815416b8d553f0a53f3e65 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 30 Nov 2020 05:23:23 +0000 Subject: [PATCH 043/150] [SPARK-32976][SQL] Support column list in INSERT statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? #### JIRA expectations ``` INSERT currently does not support named column lists. INSERT INTO (col1, col2,…) VALUES( 'val1', 'val2', … ) Note, we assume the column list contains all the column names. Issue an exception if the list is not complete. The column order could be different from the column order defined in the table definition. ``` #### implementations In this PR, we add a column list as an optional part of the `INSERT OVERWRITE/INTO` statements: ``` /** * {{{ * INSERT OVERWRITE TABLE tableIdentifier [partitionSpec [IF NOT EXISTS]]? [identifierList] ... * INSERT INTO [TABLE] tableIdentifier [partitionSpec] [identifierList] ... * }}} */ ``` The column list represents all expected columns, in an explicit order, that you want to insert into the target table. **Particularly**, we assume the column list contains all the column names in the current implementation; it will fail when the list is incomplete. In the **Analyzer**, we add a code path to resolve the column list in the `ResolveOutputRelation` rule before the statement is transformed to a v1 or v2 command. It will fail here if the list has any field that does not belong to the target table. Then, for v2 commands, e.g. `AppendData`, we use the resolved column list and the output of the target table to resolve the output of the source query in the `ResolveOutputRelation` rule. If the list has duplicated columns, we fail. If the list is not empty but its size does not match the target table, we fail. If no other exceptions occur, we use the column list to map the output of the source query to the output of the target table. The column list is then set to Nil, so the statement does not hit the rule again after it is resolved. For v1 commands, all of this happens in the `PreprocessTableInsertion` rule. ### Why are the changes needed? new feature support ### Does this PR introduce _any_ user-facing change? Yes, INSERT INTO/OVERWRITE TABLE now supports specifying a column list. ### How was this patch tested? new tests Closes #29893 from yaooqinn/SPARK-32976.
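To make the proposed syntax concrete, here is a minimal, hypothetical usage sketch; the table and column names are invented for illustration and a `spark` session (e.g. in `spark-shell`) is assumed.

```scala
// Hypothetical session illustrating the column-list syntax this patch adds.
spark.sql("CREATE TABLE t1 (c1 INT, c2 STRING) USING parquet")
spark.sql("CREATE TABLE src (name STRING, id INT) USING parquet")
// The listed columns may appear in any order; the analyzer reorders the query
// output to match the table schema before writing.
spark.sql("INSERT INTO t1 (c2, c1) SELECT name, id FROM src")
spark.sql("INSERT OVERWRITE TABLE t1 (c1, c2) SELECT id, name FROM src")
```

If the listed columns do not cover the whole table, or a name is duplicated or unknown, analysis fails as described above.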
Authored-by: Kent Yao Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/SqlBase.g4 | 4 +- .../sql/catalyst/analysis/Analyzer.scala | 52 ++++- .../sql/catalyst/analysis/CheckAnalysis.scala | 2 +- .../spark/sql/catalyst/dsl/package.scala | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 20 +- .../catalyst/plans/logical/statements.scala | 2 + .../sql/catalyst/parser/DDLParserSuite.scala | 66 ++++++ .../sql/catalyst/parser/PlanParserSuite.scala | 4 +- .../apache/spark/sql/DataFrameWriter.scala | 1 + .../datasources/DataSourceStrategy.scala | 10 +- .../datasources/FallBackFileSourceV2.scala | 4 +- .../sql/execution/datasources/rules.scala | 6 +- .../apache/spark/sql/SQLInsertTestSuite.scala | 221 ++++++++++++++++++ .../command/PlanResolutionSuite.scala | 2 +- .../spark/sql/hive/HiveStrategies.scala | 9 +- .../sql/hive/HiveSQLInsertTestSuite.scala | 25 ++ 16 files changed, 396 insertions(+), 34 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSQLInsertTestSuite.scala diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 5d17028c32ae2..a23994f456f75 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -332,8 +332,8 @@ query ; insertInto - : INSERT OVERWRITE TABLE? multipartIdentifier (partitionSpec (IF NOT EXISTS)?)? #insertOverwriteTable - | INSERT INTO TABLE? multipartIdentifier partitionSpec? (IF NOT EXISTS)? #insertIntoTable + : INSERT OVERWRITE TABLE? multipartIdentifier (partitionSpec (IF NOT EXISTS)?)? identifierList? #insertOverwriteTable + | INSERT INTO TABLE? multipartIdentifier partitionSpec? (IF NOT EXISTS)? identifierList? #insertIntoTable | INSERT OVERWRITE LOCAL? DIRECTORY path=STRING rowFormat? createFileFormat? #insertOverwriteHiveDir | INSERT OVERWRITE LOCAL? DIRECTORY (path=STRING)? tableProvider (OPTIONS options=tablePropertyList)? 
#insertOverwriteDir ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index dae496244c858..9b599b4c8f8d4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -49,7 +49,7 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.{PartitionOverwriteMode, StoreAssignmentPolicy} import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.util.{CaseInsensitiveStringMap, SchemaUtils} import org.apache.spark.util.Utils /** @@ -218,6 +218,7 @@ class Analyzer(override val catalogManager: CatalogManager) ResolveTableValuedFunctions :: ResolveNamespace(catalogManager) :: new ResolveCatalogs(catalogManager) :: + ResolveUserSpecifiedColumns :: ResolveInsertInto :: ResolveRelations :: ResolveTables :: @@ -846,7 +847,7 @@ class Analyzer(override val catalogManager: CatalogManager) def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case u @ UnresolvedRelation(ident, _, isStreaming) => lookupTempView(ident, isStreaming).getOrElse(u) - case i @ InsertIntoStatement(UnresolvedRelation(ident, _, false), _, _, _, _) => + case i @ InsertIntoStatement(UnresolvedRelation(ident, _, false), _, _, _, _, _) => lookupTempView(ident) .map(view => i.copy(table = view)) .getOrElse(i) @@ -961,7 +962,7 @@ class Analyzer(override val catalogManager: CatalogManager) .map(ResolvedTable(catalog.asTableCatalog, ident, _)) .getOrElse(u) - case i @ InsertIntoStatement(u @ UnresolvedRelation(_, _, false), _, _, _, _) + case i @ InsertIntoStatement(u @ UnresolvedRelation(_, _, false), _, _, _, _, _) if i.query.resolved => lookupV2Relation(u.multipartIdentifier, u.options, false) .map(v2Relation => i.copy(table = v2Relation)) @@ -1045,7 +1046,7 @@ class Analyzer(override val catalogManager: CatalogManager) } def apply(plan: LogicalPlan): LogicalPlan = ResolveTempViews(plan).resolveOperatorsUp { - case i @ InsertIntoStatement(table, _, _, _, _) if i.query.resolved => + case i @ InsertIntoStatement(table, _, _, _, _, _) if i.query.resolved => val relation = table match { case u @ UnresolvedRelation(_, _, false) => lookupRelation(u.multipartIdentifier, u.options, false).getOrElse(u) @@ -1160,7 +1161,8 @@ class Analyzer(override val catalogManager: CatalogManager) object ResolveInsertInto extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _) if i.query.resolved => + case i @ InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _, _) + if i.query.resolved && i.userSpecifiedCols.isEmpty => // ifPartitionNotExists is append with validation, but validation is not supported if (i.ifPartitionNotExists) { throw QueryCompilationErrors.unsupportedIfNotExistsError(r.table.name) @@ -3107,6 +3109,46 @@ class Analyzer(override val catalogManager: CatalogManager) } } + object ResolveUserSpecifiedColumns extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + case i: InsertIntoStatement if i.table.resolved && i.query.resolved && + i.userSpecifiedCols.nonEmpty => + val resolved = resolveUserSpecifiedColumns(i) + val projection = 
addColumnListOnQuery(i.table.output, resolved, i.query) + i.copy(userSpecifiedCols = Nil, query = projection) + } + + private def resolveUserSpecifiedColumns(i: InsertIntoStatement): Seq[NamedExpression] = { + SchemaUtils.checkColumnNameDuplication( + i.userSpecifiedCols, "in the column list", resolver) + + i.userSpecifiedCols.map { col => + i.table.resolve(Seq(col), resolver) + .getOrElse(i.table.failAnalysis(s"Cannot resolve column name $col")) + } + } + + private def addColumnListOnQuery( + tableOutput: Seq[Attribute], + cols: Seq[NamedExpression], + query: LogicalPlan): LogicalPlan = { + if (cols.size != query.output.size) { + query.failAnalysis( + s"Cannot write to table due to mismatched user specified column size(${cols.size}) and" + + s" data column size(${query.output.size})") + } + val nameToQueryExpr = cols.zip(query.output).toMap + // Static partition columns in the table output should not appear in the column list + // they will be handled in another rule ResolveInsertInto + val reordered = tableOutput.flatMap { nameToQueryExpr.get(_).orElse(None) } + if (reordered == query.output) { + query + } else { + Project(reordered, query) + } + } + } + private def validateStoreAssignmentPolicy(): Unit = { // SPARK-28730: LEGACY store assignment policy is disallowed in data source v2. if (conf.storeAssignmentPolicy == StoreAssignmentPolicy.LEGACY) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 9a3ab4a5f8d11..7f89c130749f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -108,7 +108,7 @@ trait CheckAnalysis extends PredicateHelper { case u: UnresolvedRelation => u.failAnalysis(s"Table or view not found: ${u.multipartIdentifier.quoted}") - case InsertIntoStatement(u: UnresolvedRelation, _, _, _, _) => + case InsertIntoStatement(u: UnresolvedRelation, _, _, _, _, _) => failAnalysis(s"Table not found: ${u.multipartIdentifier.quoted}") // TODO (SPARK-27484): handle streaming write commands when we have them. 
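To illustrate the reordering done by the new `addColumnListOnQuery` helper above, here is a small standalone sketch in plain Scala (not the rule itself; the simplified signature and names are invented for illustration).

```scala
// Standalone sketch of mapping a user-specified column list back to the table's column order.
def reorderByColumnList[A](
    tableCols: Seq[String],   // table output order, e.g. Seq("a", "b", "c")
    userCols: Seq[String],    // order written in INSERT ... (c, a, b)
    queryOutput: Seq[A]): Seq[A] = {
  require(userCols.distinct.size == userCols.size, "duplicate columns in the column list")
  require(userCols.size == queryOutput.size,
    s"mismatched column list size (${userCols.size}) and data column size (${queryOutput.size})")
  val byName = userCols.zip(queryOutput).toMap
  // Columns absent from the list (e.g. static partition columns) simply drop out here.
  tableCols.flatMap(byName.get)
}

// reorderByColumnList(Seq("a", "b"), Seq("b", "a"), Seq("colB", "colA")) == Seq("colA", "colB")
```

The actual rule works on resolved `NamedExpression`s and adds a `Project` only when the reordered output differs from the query output, but the name-to-expression mapping idea is the same.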
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 4cd649b07a5c0..89cf97e76d798 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -431,7 +431,7 @@ package object dsl { partition: Map[String, Option[String]] = Map.empty, overwrite: Boolean = false, ifPartitionNotExists: Boolean = false): LogicalPlan = - InsertIntoStatement(table, partition, logicalPlan, overwrite, ifPartitionNotExists) + InsertIntoStatement(table, partition, Nil, logicalPlan, overwrite, ifPartitionNotExists) def as(alias: String): LogicalPlan = SubqueryAlias(alias, logicalPlan) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index afef88f7e97e8..e85a3eba85377 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -243,9 +243,9 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg /** * Parameters used for writing query to a table: - * (multipartIdentifier, partitionKeys, ifPartitionNotExists). + * (multipartIdentifier, tableColumnList, partitionKeys, ifPartitionNotExists). */ - type InsertTableParams = (Seq[String], Map[String, Option[String]], Boolean) + type InsertTableParams = (Seq[String], Seq[String], Map[String, Option[String]], Boolean) /** * Parameters used for writing query to a directory: (isLocal, CatalogStorageFormat, provider). @@ -255,8 +255,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg /** * Add an * {{{ - * INSERT OVERWRITE TABLE tableIdentifier [partitionSpec [IF NOT EXISTS]]? - * INSERT INTO [TABLE] tableIdentifier [partitionSpec] + * INSERT OVERWRITE TABLE tableIdentifier [partitionSpec [IF NOT EXISTS]]? 
[identifierList] + * INSERT INTO [TABLE] tableIdentifier [partitionSpec] [identifierList] * INSERT OVERWRITE [LOCAL] DIRECTORY STRING [rowFormat] [createFileFormat] * INSERT OVERWRITE [LOCAL] DIRECTORY [STRING] tableProvider [OPTIONS tablePropertyList] * }}} @@ -267,18 +267,20 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg query: LogicalPlan): LogicalPlan = withOrigin(ctx) { ctx match { case table: InsertIntoTableContext => - val (tableIdent, partition, ifPartitionNotExists) = visitInsertIntoTable(table) + val (tableIdent, cols, partition, ifPartitionNotExists) = visitInsertIntoTable(table) InsertIntoStatement( UnresolvedRelation(tableIdent), partition, + cols, query, overwrite = false, ifPartitionNotExists) case table: InsertOverwriteTableContext => - val (tableIdent, partition, ifPartitionNotExists) = visitInsertOverwriteTable(table) + val (tableIdent, cols, partition, ifPartitionNotExists) = visitInsertOverwriteTable(table) InsertIntoStatement( UnresolvedRelation(tableIdent), partition, + cols, query, overwrite = true, ifPartitionNotExists) @@ -299,13 +301,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg override def visitInsertIntoTable( ctx: InsertIntoTableContext): InsertTableParams = withOrigin(ctx) { val tableIdent = visitMultipartIdentifier(ctx.multipartIdentifier) + val cols = Option(ctx.identifierList()).map(visitIdentifierList).getOrElse(Nil) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) if (ctx.EXISTS != null) { operationNotAllowed("INSERT INTO ... IF NOT EXISTS", ctx) } - (tableIdent, partitionKeys, false) + (tableIdent, cols, partitionKeys, false) } /** @@ -315,6 +318,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg ctx: InsertOverwriteTableContext): InsertTableParams = withOrigin(ctx) { assert(ctx.OVERWRITE() != null) val tableIdent = visitMultipartIdentifier(ctx.multipartIdentifier) + val cols = Option(ctx.identifierList()).map(visitIdentifierList).getOrElse(Nil) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) val dynamicPartitionKeys: Map[String, Option[String]] = partitionKeys.filter(_._2.isEmpty) @@ -323,7 +327,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg dynamicPartitionKeys.keys.mkString(", "), ctx) } - (tableIdent, partitionKeys, ctx.EXISTS() != null) + (tableIdent, cols, partitionKeys, ctx.EXISTS() != null) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 281d57b3648f4..d5f739466a802 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -357,6 +357,7 @@ case class DropViewStatement( * An INSERT INTO statement, as parsed from SQL. * * @param table the logical plan representing the table. + * @param userSpecifiedCols the user specified list of columns that belong to the table. * @param query the logical plan representing data to write to. * @param overwrite overwrite existing table or partitions. * @param partitionSpec a map from the partition key to the partition value (optional). 
@@ -371,6 +372,7 @@ case class DropViewStatement( case class InsertIntoStatement( table: LogicalPlan, partitionSpec: Map[String, Option[String]], + userSpecifiedCols: Seq[String], query: LogicalPlan, overwrite: Boolean, ifPartitionNotExists: Boolean) extends ParsedStatement { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index c58ff81f17131..91b35bcac98ae 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -1172,6 +1172,22 @@ class DDLParserSuite extends AnalysisTest { InsertIntoStatement( UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), Map.empty, + Nil, + Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), + overwrite = false, ifPartitionNotExists = false)) + } + } + + test("insert table: basic append with a column list") { + Seq( + "INSERT INTO TABLE testcat.ns1.ns2.tbl (a, b) SELECT * FROM source", + "INSERT INTO testcat.ns1.ns2.tbl (a, b) SELECT * FROM source" + ).foreach { sql => + parseCompare(sql, + InsertIntoStatement( + UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), + Map.empty, + Seq("a", "b"), Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), overwrite = false, ifPartitionNotExists = false)) } @@ -1182,6 +1198,7 @@ class DDLParserSuite extends AnalysisTest { InsertIntoStatement( UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), Map.empty, + Nil, Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("testcat2", "db", "tbl"))), overwrite = false, ifPartitionNotExists = false)) } @@ -1196,6 +1213,22 @@ class DDLParserSuite extends AnalysisTest { InsertIntoStatement( UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), Map("p1" -> Some("3"), "p2" -> None), + Nil, + Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), + overwrite = false, ifPartitionNotExists = false)) + } + + test("insert table: append with partition and a column list") { + parseCompare( + """ + |INSERT INTO testcat.ns1.ns2.tbl + |PARTITION (p1 = 3, p2) (a, b) + |SELECT * FROM source + """.stripMargin, + InsertIntoStatement( + UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), + Map("p1" -> Some("3"), "p2" -> None), + Seq("a", "b"), Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), overwrite = false, ifPartitionNotExists = false)) } @@ -1209,6 +1242,22 @@ class DDLParserSuite extends AnalysisTest { InsertIntoStatement( UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), Map.empty, + Nil, + Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), + overwrite = true, ifPartitionNotExists = false)) + } + } + + test("insert table: overwrite with column list") { + Seq( + "INSERT OVERWRITE TABLE testcat.ns1.ns2.tbl (a, b) SELECT * FROM source", + "INSERT OVERWRITE testcat.ns1.ns2.tbl (a, b) SELECT * FROM source" + ).foreach { sql => + parseCompare(sql, + InsertIntoStatement( + UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), + Map.empty, + Seq("a", "b"), Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), overwrite = true, ifPartitionNotExists = false)) } @@ -1224,6 +1273,22 @@ class DDLParserSuite extends AnalysisTest { InsertIntoStatement( UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), Map("p1" -> Some("3"), "p2" -> None), + Nil, + Project(Seq(UnresolvedStar(None)), 
UnresolvedRelation(Seq("source"))), + overwrite = true, ifPartitionNotExists = false)) + } + + test("insert table: overwrite with partition and column list") { + parseCompare( + """ + |INSERT OVERWRITE TABLE testcat.ns1.ns2.tbl + |PARTITION (p1 = 3, p2) (a, b) + |SELECT * FROM source + """.stripMargin, + InsertIntoStatement( + UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), + Map("p1" -> Some("3"), "p2" -> None), + Seq("a", "b"), Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), overwrite = true, ifPartitionNotExists = false)) } @@ -1238,6 +1303,7 @@ class DDLParserSuite extends AnalysisTest { InsertIntoStatement( UnresolvedRelation(Seq("testcat", "ns1", "ns2", "tbl")), Map("p1" -> Some("3")), + Nil, Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("source"))), overwrite = true, ifPartitionNotExists = true)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 88afcb10d9c20..6fef18babedb6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -295,7 +295,7 @@ class PlanParserSuite extends AnalysisTest { partition: Map[String, Option[String]], overwrite: Boolean = false, ifPartitionNotExists: Boolean = false): LogicalPlan = - InsertIntoStatement(table("s"), partition, plan, overwrite, ifPartitionNotExists) + InsertIntoStatement(table("s"), partition, Nil, plan, overwrite, ifPartitionNotExists) // Single inserts assertEqual(s"insert overwrite table s $sql", @@ -713,7 +713,7 @@ class PlanParserSuite extends AnalysisTest { comparePlans( parsePlan( "INSERT INTO s SELECT /*+ REPARTITION(100), COALESCE(500), COALESCE(10) */ * FROM t"), - InsertIntoStatement(table("s"), Map.empty, + InsertIntoStatement(table("s"), Map.empty, Nil, UnresolvedHint("REPARTITION", Seq(Literal(100)), UnresolvedHint("COALESCE", Seq(Literal(500)), UnresolvedHint("COALESCE", Seq(Literal(10)), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index a8688bdf15495..c5f2a3d568e97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -536,6 +536,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { InsertIntoStatement( table = UnresolvedRelation(tableIdent), partitionSpec = Map.empty[String, Option[String]], + Nil, query = df.logicalPlan, overwrite = mode == SaveMode.Overwrite, ifPartitionNotExists = false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 361d1fab03421..e4f001d61a767 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -156,7 +156,7 @@ object DataSourceAnalysis extends Rule[LogicalPlan] with CastSupport { CreateDataSourceTableAsSelectCommand(tableDesc, mode, query, query.output.map(_.name)) case InsertIntoStatement(l @ LogicalRelation(_: InsertableRelation, _, _, _), - parts, query, overwrite, false) if parts.isEmpty => + parts, _, query, overwrite, false) if parts.isEmpty => 
InsertIntoDataSourceCommand(l, query, overwrite) case InsertIntoDir(_, storage, provider, query, overwrite) @@ -168,7 +168,7 @@ object DataSourceAnalysis extends Rule[LogicalPlan] with CastSupport { InsertIntoDataSourceDirCommand(storage, provider.get, query, overwrite) case i @ InsertIntoStatement( - l @ LogicalRelation(t: HadoopFsRelation, _, table, _), parts, query, overwrite, _) => + l @ LogicalRelation(t: HadoopFsRelation, _, table, _), parts, _, query, overwrite, _) => // If the InsertIntoTable command is for a partitioned HadoopFsRelation and // the user has specified static partitions, we add a Project operator on top of the query // to include those constant column values in the query result. @@ -276,11 +276,11 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta, options, false), _, _, _, _) - if DDLUtils.isDatasourceTable(tableMeta) => + case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta, options, false), + _, _, _, _, _) if DDLUtils.isDatasourceTable(tableMeta) => i.copy(table = readDataSourceTable(tableMeta, options)) - case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta, _, false), _, _, _, _) => + case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta, _, false), _, _, _, _, _) => i.copy(table = DDLUtils.readHiveTable(tableMeta)) case UnresolvedCatalogRelation(tableMeta, options, false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala index 1149767bdade2..b5d06db024112 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala @@ -34,8 +34,8 @@ import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, File */ class FallBackFileSourceV2(sparkSession: SparkSession) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ - InsertIntoStatement(d @ DataSourceV2Relation(table: FileTable, _, _, _, _), _, _, _, _) => + case i @ InsertIntoStatement( + d @ DataSourceV2Relation(table: FileTable, _, _, _, _), _, _, _, _, _) => val v1FileFormat = table.fallbackFileFormat.newInstance() val relation = HadoopFsRelation( table.fileIndex, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 9e65b0ce13693..2cc78258378ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -434,7 +434,7 @@ object PreprocessTableInsertion extends Rule[LogicalPlan] { } def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ InsertIntoStatement(table, _, query, _, _) if table.resolved && query.resolved => + case i @ InsertIntoStatement(table, _, _, query, _, _) if table.resolved && query.resolved => table match { case relation: HiveTableRelation => val metadata = relation.tableMeta @@ -512,7 +512,7 @@ object PreWriteCheck extends (LogicalPlan => Unit) { def apply(plan: LogicalPlan): Unit = { plan.foreach { - case InsertIntoStatement(l @ LogicalRelation(relation, 
_, _, _), partition, query, _, _) => + case InsertIntoStatement(l @ LogicalRelation(relation, _, _, _), partition, _, query, _, _) => // Get all input data source relations of the query. val srcRelations = query.collect { case LogicalRelation(src, _, _, _) => src @@ -534,7 +534,7 @@ object PreWriteCheck extends (LogicalPlan => Unit) { case _ => failAnalysis(s"$relation does not allow insertion.") } - case InsertIntoStatement(t, _, _, _, _) + case InsertIntoStatement(t, _, _, _, _, _) if !t.isInstanceOf[LeafNode] || t.isInstanceOf[Range] || t.isInstanceOf[OneRowRelation] || diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala new file mode 100644 index 0000000000000..e454f0e6d540f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.SparkConf +import org.apache.spark.sql.connector.InMemoryPartitionTableCatalog +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} + +/** + * The base trait for DML - insert syntax + */ +trait SQLInsertTestSuite extends QueryTest with SQLTestUtils { + + import testImplicits._ + + def format: String + + protected def createTable( + table: String, + cols: Seq[String], + colTypes: Seq[String], + partCols: Seq[String] = Nil): Unit = { + val values = cols.zip(colTypes).map(tuple => tuple._1 + " " + tuple._2).mkString("(", ", ", ")") + val partitionSpec = if (partCols.nonEmpty) { + partCols.mkString("PARTITIONED BY (", ",", ")") + } else "" + sql(s"CREATE TABLE $table$values USING $format $partitionSpec") + } + + protected def processInsert( + tableName: String, + input: DataFrame, + cols: Seq[String] = Nil, + partitionExprs: Seq[String] = Nil, + overwrite: Boolean): Unit = { + val tmpView = "tmp_view" + val columnList = if (cols.nonEmpty) cols.mkString("(", ",", ")") else "" + val partitionList = if (partitionExprs.nonEmpty) { + partitionExprs.mkString("PARTITION (", ",", ")") + } else "" + withTempView(tmpView) { + input.createOrReplaceTempView(tmpView) + val overwriteStr = if (overwrite) "OVERWRITE" else "INTO" + sql( + s"INSERT $overwriteStr TABLE $tableName $partitionList $columnList SELECT * FROM $tmpView") + } + } + + protected def verifyTable(tableName: String, expected: DataFrame): Unit = { + checkAnswer(spark.table(tableName), expected) + } + + test("insert with column list - follow table output order") { + withTable("t1") { + val df = Seq((1, 2L, "3")).toDF() + val cols = Seq("c1", "c2", "c3") + createTable("t1", cols, Seq("int", "long", "string")) + Seq(false, true).foreach { m => + 
processInsert("t1", df, cols, overwrite = m) + verifyTable("t1", df) + } + } + } + + test("insert with column list - follow table output order + partitioned table") { + val cols = Seq("c1", "c2", "c3", "c4") + val df = Seq((1, 2, 3, 4)).toDF(cols: _*) + withTable("t1") { + createTable("t1", cols, Seq("int", "int", "int", "int"), cols.takeRight(2)) + Seq(false, true).foreach { m => + processInsert("t1", df, cols, overwrite = m) + verifyTable("t1", df) + } + } + + withTable("t1") { + createTable("t1", cols, Seq("int", "int", "int", "int"), cols.takeRight(2)) + Seq(false, true).foreach { m => + processInsert( + "t1", df.selectExpr("c1", "c2"), cols.take(2), Seq("c3=3", "c4=4"), overwrite = m) + verifyTable("t1", df) + } + } + + withTable("t1") { + createTable("t1", cols, Seq("int", "int", "int", "int"), cols.takeRight(2)) + Seq(false, true).foreach { m => + processInsert("t1", df.selectExpr("c1", "c2", "c4"), + cols.filterNot(_ == "c3"), Seq("c3=3", "c4"), overwrite = m) + verifyTable("t1", df) + } + } + } + + test("insert with column list - table output reorder") { + withTable("t1") { + val cols = Seq("c1", "c2", "c3") + val df = Seq((1, 2, 3)).toDF(cols: _*) + createTable("t1", cols, Seq("int", "int", "int")) + Seq(false, true).foreach { m => + processInsert("t1", df, cols.reverse, overwrite = m) + verifyTable("t1", df.selectExpr(cols.reverse: _*)) + } + } + } + + test("insert with column list - table output reorder + partitioned table") { + val cols = Seq("c1", "c2", "c3", "c4") + val df = Seq((1, 2, 3, 4)).toDF(cols: _*) + withTable("t1") { + createTable("t1", cols, Seq("int", "int", "int", "int"), cols.takeRight(2)) + Seq(false, true).foreach { m => + processInsert("t1", df, cols.reverse, overwrite = m) + verifyTable("t1", df.selectExpr(cols.reverse: _*)) + } + } + + withTable("t1") { + createTable("t1", cols, Seq("int", "int", "int", "int"), cols.takeRight(2)) + Seq(false, true).foreach { m => + processInsert( + "t1", df.selectExpr("c1", "c2"), cols.take(2).reverse, Seq("c3=3", "c4=4"), overwrite = m) + verifyTable("t1", df.selectExpr("c2", "c1", "c3", "c4")) + } + } + + withTable("t1") { + createTable("t1", cols, Seq("int", "int", "int", "int"), cols.takeRight(2)) + Seq(false, true).foreach { m => + processInsert("t1", + df.selectExpr("c1", "c2", "c4"), Seq("c4", "c2", "c1"), Seq("c3=3", "c4"), overwrite = m) + verifyTable("t1", df.selectExpr("c4", "c2", "c3", "c1")) + } + } + } + + test("insert with column list - duplicated columns") { + withTable("t1") { + val cols = Seq("c1", "c2", "c3") + createTable("t1", cols, Seq("int", "long", "string")) + val e1 = intercept[AnalysisException](sql(s"INSERT INTO t1 (c1, c2, c2) values(1, 2, 3)")) + assert(e1.getMessage === "Found duplicate column(s) in the column list: `c2`;") + } + } + + test("insert with column list - invalid columns") { + withTable("t1") { + val cols = Seq("c1", "c2", "c3") + createTable("t1", cols, Seq("int", "long", "string")) + val e1 = intercept[AnalysisException](sql(s"INSERT INTO t1 (c1, c2, c4) values(1, 2, 3)")) + assert(e1.getMessage === "Cannot resolve column name c4;") + } + } + + test("insert with column list - mismatched column list size") { + val msg = "Cannot write to table due to mismatched user specified column size" + withTable("t1") { + val cols = Seq("c1", "c2", "c3") + createTable("t1", cols, Seq("int", "long", "string")) + val e1 = intercept[AnalysisException](sql(s"INSERT INTO t1 (c1, c2) values(1, 2, 3)")) + assert(e1.getMessage.contains(msg)) + val e2 = intercept[AnalysisException](sql(s"INSERT INTO 
t1 (c1, c2, c3) values(1, 2)")) + assert(e2.getMessage.contains(msg)) + } + } + + test("insert with column list - mismatched target table out size after rewritten query") { + val v2Msg = "Cannot write to 'testcat.t1', not enough data columns:" + val cols = Seq("c1", "c2", "c3", "c4") + + withTable("t1") { + createTable("t1", cols, Seq.fill(4)("int")) + val e1 = intercept[AnalysisException](sql(s"INSERT INTO t1 (c1) values(1)")) + assert(e1.getMessage.contains("target table has 4 column(s) but the inserted data has 1") || + e1.getMessage.contains(v2Msg)) + } + + withTable("t1") { + createTable("t1", cols, Seq.fill(4)("int"), cols.takeRight(2)) + val e1 = intercept[AnalysisException] { + sql(s"INSERT INTO t1 partition(c3=3, c4=4) (c1) values(1)") + } + assert(e1.getMessage.contains("target table has 4 column(s) but the inserted data has 3") || + e1.getMessage.contains(v2Msg)) + } + } +} + +class FileSourceSQLInsertTestSuite extends SQLInsertTestSuite with SharedSparkSession { + override def format: String = "parquet" + override protected def sparkConf: SparkConf = { + super.sparkConf.set(SQLConf.USE_V1_SOURCE_LIST, format) + } +} + +class DSV2SQLInsertTestSuite extends SQLInsertTestSuite with SharedSparkSession { + + override def format: String = "foo" + + protected override def sparkConf: SparkConf = { + super.sparkConf + .set("spark.sql.catalog.testcat", classOf[InMemoryPartitionTableCatalog].getName) + .set(SQLConf.DEFAULT_CATALOG.key, "testcat") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 92c114e116d0c..9710fca6bc82c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -1185,7 +1185,7 @@ class PlanResolutionSuite extends AnalysisTest { case Project(_, AsDataSourceV2Relation(r)) => assert(r.catalog.exists(_ == catlogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) - case InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _) => + case InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _, _) => assert(r.catalog.exists(_ == catlogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) case DescribeRelation(r: ResolvedTable, _, _) => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 3d8bba8b1b425..ff7dc58829fa1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -145,7 +145,7 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { // handles InsertIntoStatement specially as the table in InsertIntoStatement is not added in its // children, hence not matched directly by previous HiveTableRelation case. 
- case i @ InsertIntoStatement(relation: HiveTableRelation, _, _, _, _) + case i @ InsertIntoStatement(relation: HiveTableRelation, _, _, _, _, _) if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty => i.copy(table = hiveTableWithStats(relation)) } @@ -159,7 +159,8 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { */ object HiveAnalysis extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case InsertIntoStatement(r: HiveTableRelation, partSpec, query, overwrite, ifPartitionNotExists) + case InsertIntoStatement( + r: HiveTableRelation, partSpec, _, query, overwrite, ifPartitionNotExists) if DDLUtils.isHiveTable(r.tableMeta) => InsertIntoHiveTable(r.tableMeta, partSpec, query, overwrite, ifPartitionNotExists, query.output.map(_.name)) @@ -207,11 +208,11 @@ case class RelationConversions( plan resolveOperators { // Write path case InsertIntoStatement( - r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists) + r: HiveTableRelation, partition, cols, query, overwrite, ifPartitionNotExists) if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && (!r.isPartitioned || SQLConf.get.getConf(HiveUtils.CONVERT_INSERTING_PARTITIONED_TABLE)) && isConvertible(r) => - InsertIntoStatement(metastoreCatalog.convert(r), partition, + InsertIntoStatement(metastoreCatalog.convert(r), partition, cols, query, overwrite, ifPartitionNotExists) // Read path diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSQLInsertTestSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSQLInsertTestSuite.scala new file mode 100644 index 0000000000000..49b005bca938e --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSQLInsertTestSuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import org.apache.spark.sql.SQLInsertTestSuite +import org.apache.spark.sql.hive.test.TestHiveSingleton + +class HiveSQLInsertTestSuite extends SQLInsertTestSuite with TestHiveSingleton { + override def format: String = "hive OPTIONS(fileFormat='parquet')" +} From 0fd9f57dd4cee32b4d0a16345f98e628a9d5f0fe Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Mon, 30 Nov 2020 05:37:10 +0000 Subject: [PATCH 044/150] [SPARK-33448][SQL] Support CACHE/UNCACHE TABLE commands for v2 tables ### What changes were proposed in this pull request? This PR proposes to support `CHACHE/UNCACHE TABLE` commands for v2 tables. In addtion, this PR proposes to migrate `CACHE/UNCACHE TABLE` to use `UnresolvedTableOrView` to resolve the table identifier. This allows consistent resolution rules (temp view first, etc.) to be applied for both v1/v2 commands. 
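For example (mirroring the `DataSourceV2SQLSuite` changes below, where `testcat` is the test v2 catalog), the following commands now work against a v2 table:
```
CREATE TABLE testcat.ns1.ns2.tbl (id bigint, data string) USING foo;
CACHE TABLE testcat.ns1.ns2.tbl;
UNCACHE TABLE testcat.ns1.ns2.tbl;
-- With IF EXISTS, UNCACHE TABLE does not throw if the table does not exist.
UNCACHE TABLE IF EXISTS testcat.ns1.ns2.tbl;
```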
More info about the consistent resolution rule proposal can be found in [JIRA](https://issues.apache.org/jira/browse/SPARK-29900) or [proposal doc](https://docs.google.com/document/d/1hvLjGA8y_W_hhilpngXVub1Ebv8RsMap986nENCFnrg/edit?usp=sharing). ### Why are the changes needed? To support `CACHE/UNCACHE TABLE` commands for v2 tables. Note that `CACHE/UNCACHE TABLE` for v1 tables/views go through `SparkSession.table` to resolve identifier, which resolves temp views first, so there is no change in the behavior by moving to the new framework. ### Does this PR introduce _any_ user-facing change? Yes. Now the user can run `CACHE/UNCACHE TABLE` commands on v2 tables. ### How was this patch tested? Added/updated existing tests. Closes #30403 from imback82/cache_table. Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../sql/catalyst/parser/AstBuilder.scala | 31 ------------- .../catalyst/plans/logical/statements.scala | 16 ------- .../sql/catalyst/parser/DDLParserSuite.scala | 27 ----------- .../analysis/ResolveSessionCatalog.scala | 19 +------- .../spark/sql/execution/SparkSqlParser.scala | 34 ++++++++++++++ .../spark/sql/execution/command/cache.scala | 43 +++++++++++------- .../apache/spark/sql/CachedTableSuite.scala | 11 +++++ .../sql/connector/DataSourceV2SQLSuite.scala | 40 ++++++++++------- .../sql/execution/SparkSqlParserSuite.scala | 45 ++++++++++++++++++- .../execution/metric/SQLMetricsSuite.scala | 2 +- .../HiveThriftServer2Suites.scala | 4 +- .../spark/sql/hive/CachedTableSuite.scala | 14 +++--- .../apache/spark/sql/hive/test/TestHive.scala | 2 +- 13 files changed, 152 insertions(+), 136 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index e85a3eba85377..a31d7ca7268a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3590,37 +3590,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg ctx.SERDE != null) } - /** - * Create a [[CacheTableStatement]]. - * - * For example: - * {{{ - * CACHE [LAZY] TABLE multi_part_name - * [OPTIONS tablePropertyList] [[AS] query] - * }}} - */ - override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { - import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ - - val query = Option(ctx.query).map(plan) - val tableName = visitMultipartIdentifier(ctx.multipartIdentifier) - if (query.isDefined && tableName.length > 1) { - val catalogAndNamespace = tableName.init - throw new ParseException("It is not allowed to add catalog/namespace " + - s"prefix ${catalogAndNamespace.quoted} to " + - "the table name in CACHE TABLE AS SELECT", ctx) - } - val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) - CacheTableStatement(tableName, query, ctx.LAZY != null, options) - } - - /** - * Create an [[UncacheTableStatement]] logical plan. - */ - override def visitUncacheTable(ctx: UncacheTableContext): LogicalPlan = withOrigin(ctx) { - UncacheTableStatement(visitMultipartIdentifier(ctx.multipartIdentifier), ctx.EXISTS != null) - } - /** * Create a [[TruncateTable]] command. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index d5f739466a802..effb4cff75930 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -412,22 +412,6 @@ case class UseStatement(isNamespaceSet: Boolean, nameParts: Seq[String]) extends */ case class RepairTableStatement(tableName: Seq[String]) extends ParsedStatement -/** - * A CACHE TABLE statement, as parsed from SQL - */ -case class CacheTableStatement( - tableName: Seq[String], - plan: Option[LogicalPlan], - isLazy: Boolean, - options: Map[String, String]) extends ParsedStatement - -/** - * An UNCACHE TABLE statement, as parsed from SQL - */ -case class UncacheTableStatement( - tableName: Seq[String], - ifExists: Boolean) extends ParsedStatement - /** * A TRUNCATE TABLE statement, as parsed from SQL */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index 91b35bcac98ae..0f1b4a3ea918c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -1984,33 +1984,6 @@ class DDLParserSuite extends AnalysisTest { asSerde = true)) } - test("CACHE TABLE") { - comparePlans( - parsePlan("CACHE TABLE a.b.c"), - CacheTableStatement(Seq("a", "b", "c"), None, false, Map.empty)) - - comparePlans( - parsePlan("CACHE LAZY TABLE a.b.c"), - CacheTableStatement(Seq("a", "b", "c"), None, true, Map.empty)) - - comparePlans( - parsePlan("CACHE LAZY TABLE a.b.c OPTIONS('storageLevel' 'DISK_ONLY')"), - CacheTableStatement(Seq("a", "b", "c"), None, true, Map("storageLevel" -> "DISK_ONLY"))) - - intercept("CACHE TABLE a.b.c AS SELECT * FROM testData", - "It is not allowed to add catalog/namespace prefix a.b") - } - - test("UNCACHE TABLE") { - comparePlans( - parsePlan("UNCACHE TABLE a.b.c"), - UncacheTableStatement(Seq("a", "b", "c"), ifExists = false)) - - comparePlans( - parsePlan("UNCACHE TABLE IF EXISTS a.b.c"), - UncacheTableStatement(Seq("a", "b", "c"), ifExists = true)) - } - test("TRUNCATE table") { comparePlans( parsePlan("TRUNCATE TABLE a.b.c"), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index f49caf7f04a20..582f11a2be8fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -446,20 +446,6 @@ class ResolveSessionCatalog( ShowCreateTableCommand(ident.asTableIdentifier) } - case CacheTableStatement(tbl, plan, isLazy, options) => - val name = if (plan.isDefined) { - // CACHE TABLE ... AS SELECT creates a temp view with the input query. - // Temp view doesn't belong to any catalog and we shouldn't resolve catalog in the name. 
- tbl - } else { - parseTempViewOrV1Table(tbl, "CACHE TABLE") - } - CacheTableCommand(name.asTableIdentifier, plan, isLazy, options) - - case UncacheTableStatement(tbl, ifExists) => - val name = parseTempViewOrV1Table(tbl, "UNCACHE TABLE") - UncacheTableCommand(name.asTableIdentifier, ifExists) - case TruncateTable(ResolvedV1TableIdentifier(ident), partitionSpec) => TruncateTableCommand( ident.asTableIdentifier, @@ -561,12 +547,9 @@ class ResolveSessionCatalog( "SHOW VIEWS, only SessionCatalog supports this command.") } - case ShowTableProperties(ResolvedV1TableIdentifier(ident), propertyKey) => + case ShowTableProperties(ResolvedV1TableOrViewIdentifier(ident), propertyKey) => ShowTablePropertiesCommand(ident.asTableIdentifier, propertyKey) - case ShowTableProperties(r: ResolvedView, propertyKey) => - ShowTablePropertiesCommand(r.identifier.asTableIdentifier, propertyKey) - case DescribeFunction(ResolvedFunc(identifier), extended) => DescribeFunctionCommand(identifier.asFunctionIdentifier, extended) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 568c7112954f5..c82e3818b48cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -192,6 +192,40 @@ class SparkSqlAstBuilder extends AstBuilder { unquotedPath } + /** + * Create a [[CacheTableCommand]]. + * + * For example: + * {{{ + * CACHE [LAZY] TABLE multi_part_name + * [OPTIONS tablePropertyList] [[AS] query] + * }}} + */ + override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + val query = Option(ctx.query).map(plan) + val tableName = visitMultipartIdentifier(ctx.multipartIdentifier) + if (query.isDefined && tableName.length > 1) { + val catalogAndNamespace = tableName.init + throw new ParseException("It is not allowed to add catalog/namespace " + + s"prefix ${catalogAndNamespace.quoted} to " + + "the table name in CACHE TABLE AS SELECT", ctx) + } + val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) + CacheTableCommand(tableName, query, ctx.LAZY != null, options) + } + + + /** + * Create an [[UncacheTableCommand]] logical plan. + */ + override def visitUncacheTable(ctx: UncacheTableContext): LogicalPlan = withOrigin(ctx) { + UncacheTableCommand( + visitMultipartIdentifier(ctx.multipartIdentifier), + ctx.EXISTS != null) + } + /** * Create a [[ClearCacheCommand]] logical plan. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala index f99dc8d9f1a8e..3f0945d1e817b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -19,26 +19,27 @@ package org.apache.spark.sql.execution.command import java.util.Locale -import org.apache.spark.sql.{Dataset, Row, SparkSession} -import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper import org.apache.spark.storage.StorageLevel case class CacheTableCommand( - tableIdent: TableIdentifier, + multipartIdentifier: Seq[String], plan: Option[LogicalPlan], isLazy: Boolean, options: Map[String, String]) extends RunnableCommand { - require(plan.isEmpty || tableIdent.database.isEmpty, - "Database name is not allowed in CACHE TABLE AS SELECT") + require(plan.isEmpty || multipartIdentifier.length == 1, + "Namespace name is not allowed in CACHE TABLE AS SELECT") override def innerChildren: Seq[QueryPlan[_]] = plan.toSeq override def run(sparkSession: SparkSession): Seq[Row] = { + val tableName = multipartIdentifier.quoted plan.foreach { logicalPlan => - Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableIdent.quotedString) + Dataset.ofRows(sparkSession, logicalPlan).createTempView(tableName) } val storageLevelKey = "storagelevel" @@ -49,34 +50,46 @@ case class CacheTableCommand( logWarning(s"Invalid options: ${withoutStorageLevel.mkString(", ")}") } + val table = sparkSession.table(tableName) if (storageLevelValue.nonEmpty) { - sparkSession.catalog.cacheTable( - tableIdent.quotedString, StorageLevel.fromString(storageLevelValue.get)) + sparkSession.sharedState.cacheManager.cacheQuery( + table, + Some(tableName), + StorageLevel.fromString(storageLevelValue.get)) } else { - sparkSession.catalog.cacheTable(tableIdent.quotedString) + sparkSession.sharedState.cacheManager.cacheQuery(table, Some(tableName)) } if (!isLazy) { // Performs eager caching - sparkSession.table(tableIdent).count() + table.count() } Seq.empty[Row] } } - case class UncacheTableCommand( - tableIdent: TableIdentifier, + multipartIdentifier: Seq[String], ifExists: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - val tableId = tableIdent.quotedString - if (!ifExists || sparkSession.catalog.tableExists(tableId)) { - sparkSession.catalog.uncacheTable(tableId) + val tableName = multipartIdentifier.quoted + table(sparkSession, tableName).foreach { table => + val cascade = !sparkSession.sessionState.catalog.isTempView(multipartIdentifier) + sparkSession.sharedState.cacheManager.uncacheQuery(table, cascade) } Seq.empty[Row] } + + private def table(sparkSession: SparkSession, name: String): Option[DataFrame] = { + try { + Some(sparkSession.table(name)) + } catch { + case ex: AnalysisException if ifExists && ex.getMessage.contains("Table or view not found") => + None + } + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 
6313370476c93..ef3f4daa6dc6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.executor.DataReadMethod._ import org.apache.spark.executor.DataReadMethod.DataReadMethod import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.TempTableAlreadyExistsException import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, Join, JoinStrategyHint, SHUFFLE_HASH} import org.apache.spark.sql.catalyst.util.DateTimeConstants @@ -140,6 +141,16 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } } + test("cache table as select - existing temp view") { + withTempView("tempView") { + sql("CREATE TEMPORARY VIEW tempView as SELECT 1") + val e = intercept[TempTableAlreadyExistsException] { + sql("CACHE TABLE tempView AS SELECT 1") + } + assert(e.getMessage.contains("Temporary view 'tempView' already exists")) + } + } + test("uncaching temp table") { withTempView("tempTable1", "tempTable2") { testData.select("key").createOrReplaceTempView("tempTable1") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 98580568a8df6..ffbc2287d81ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership +import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode, V2_SESSION_CATALOG_IMPLEMENTATION} import org.apache.spark.sql.internal.connector.SimpleTableProvider @@ -2018,28 +2019,29 @@ class DataSourceV2SQLSuite } } - test("CACHE TABLE") { + test("CACHE/UNCACHE TABLE") { val t = "testcat.ns1.ns2.tbl" withTable(t) { - spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + def isCached(table: String): Boolean = { + spark.table(table).queryExecution.withCachedData.isInstanceOf[InMemoryRelation] + } - testV1CommandSupportingTempView("CACHE TABLE", t) + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + sql(s"CACHE TABLE $t") + assert(isCached(t)) - val e = intercept[AnalysisException] { - sql(s"CACHE LAZY TABLE $t") - } - assert(e.message.contains("CACHE TABLE is only supported with temp views or v1 tables")) + sql(s"UNCACHE TABLE $t") + assert(!isCached(t)) } - } - test("UNCACHE TABLE") { - val t = "testcat.ns1.ns2.tbl" - withTable(t) { - sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - - testV1CommandSupportingTempView("UNCACHE TABLE", t) - testV1CommandSupportingTempView("UNCACHE TABLE", s"IF EXISTS $t") + // Test a scenario where a table does not exist. 
+ val e = intercept[AnalysisException] { + sql(s"UNCACHE TABLE $t") } + assert(e.message.contains("Table or view not found: testcat.ns1.ns2.tbl")) + + // If "IF EXISTS" is set, UNCACHE TABLE will not throw an exception. + sql(s"UNCACHE TABLE IF EXISTS $t") } test("SHOW COLUMNS") { @@ -2555,11 +2557,15 @@ class DataSourceV2SQLSuite } } - private def testNotSupportedV2Command(sqlCommand: String, sqlParams: String): Unit = { + private def testNotSupportedV2Command( + sqlCommand: String, + sqlParams: String, + sqlCommandInMessage: Option[String] = None): Unit = { val e = intercept[AnalysisException] { sql(s"$sqlCommand $sqlParams") } - assert(e.message.contains(s"$sqlCommand is not supported for v2 tables")) + val cmdStr = sqlCommandInMessage.getOrElse(sqlCommand) + assert(e.message.contains(s"$cmdStr is not supported for v2 tables")) } private def testV1Command(sqlCommand: String, sqlParams: String): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 61c16baedb7cc..1a826c00c81f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -337,5 +337,48 @@ class SparkSqlParserSuite extends AnalysisTest { |FROM v """.stripMargin, "LINES TERMINATED BY only supports newline '\\n' right now") - } + } + + test("CACHE TABLE") { + assertEqual( + "CACHE TABLE a.b.c", + CacheTableCommand(Seq("a", "b", "c"), None, false, Map.empty)) + + assertEqual( + "CACHE TABLE t AS SELECT * FROM testData", + CacheTableCommand( + Seq("t"), + Some(Project(Seq(UnresolvedStar(None)), UnresolvedRelation(Seq("testData")))), + false, + Map.empty)) + + assertEqual( + "CACHE LAZY TABLE a.b.c", + CacheTableCommand(Seq("a", "b", "c"), None, true, Map.empty)) + + assertEqual( + "CACHE LAZY TABLE a.b.c OPTIONS('storageLevel' 'DISK_ONLY')", + CacheTableCommand( + Seq("a", "b", "c"), + None, + true, + Map("storageLevel" -> "DISK_ONLY"))) + + intercept("CACHE TABLE a.b.c AS SELECT * FROM testData", + "It is not allowed to add catalog/namespace prefix a.b") + } + + test("UNCACHE TABLE") { + assertEqual( + "UNCACHE TABLE a.b.c", + UncacheTableCommand(Seq("a", "b", "c"), ifExists = false)) + + assertEqual( + "UNCACHE TABLE IF EXISTS a.b.c", + UncacheTableCommand(Seq("a", "b", "c"), ifExists = true)) + } + + test("CLEAR CACHE") { + assertEqual("CLEAR CACHE", ClearCacheCommand) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 4872906dbfec3..b4f921efcac81 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -705,7 +705,7 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils sql("CREATE TEMPORARY VIEW inMemoryTable AS SELECT 1 AS c1") sql("CACHE TABLE inMemoryTable") testSparkPlanMetrics(spark.table("inMemoryTable"), 1, - Map(1L -> (("Scan In-memory table `inMemoryTable`", Map.empty))) + Map(1L -> (("Scan In-memory table inMemoryTable", Map.empty))) ) sql("CREATE TEMPORARY VIEW ```a``b``` AS SELECT 2 AS c1") diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 7cc60bb505089..5bf7892478082 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -305,7 +305,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { val plan = statement.executeQuery("explain select * from test_table") plan.next() plan.next() - assert(plan.getString(1).contains("Scan In-memory table `test_table`")) + assert(plan.getString(1).contains("Scan In-memory table test_table")) val rs1 = statement.executeQuery("SELECT key FROM test_table ORDER BY KEY DESC") val buf1 = new collection.mutable.ArrayBuffer[Int]() @@ -391,7 +391,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { val plan = statement.executeQuery("explain select key from test_map ORDER BY key DESC") plan.next() plan.next() - assert(plan.getString(1).contains("Scan In-memory table `test_table`")) + assert(plan.getString(1).contains("Scan In-memory table test_table")) val rs = statement.executeQuery("SELECT key FROM test_map ORDER BY KEY DESC") val buf = new collection.mutable.ArrayBuffer[Int]() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index fc793534641df..81c3f271b18d4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -113,7 +113,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto e = intercept[AnalysisException] { sql("UNCACHE TABLE nonexistentTable") }.getMessage - assert(e.contains(s"$expectedErrorMsg default.nonexistentTable")) + assert(e.contains(s"$expectedErrorMsg nonexistentTable")) sql("UNCACHE TABLE IF EXISTS nonexistentTable") } @@ -364,14 +364,14 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // Cache the table 'cachedTable' in temp db with qualified table name, // and then check whether the table is cached with expected name sql(s"CACHE TABLE $db.cachedTable OPTIONS('storageLevel' 'MEMORY_ONLY')") - assertCached(sql(s"SELECT * FROM $db.cachedTable"), s"`$db`.`cachedTable`", MEMORY_ONLY) + assertCached(sql(s"SELECT * FROM $db.cachedTable"), s"$db.cachedTable", MEMORY_ONLY) assert(spark.catalog.isCached(s"$db.cachedTable"), s"Table '$db.cachedTable' should be cached.") // Refresh the table 'cachedTable' in temp db with qualified table name, and then check // whether the table is still cached with the same name and storage level. 
sql(s"REFRESH TABLE $db.cachedTable") - assertCached(sql(s"select * from $db.cachedTable"), s"`$db`.`cachedTable`", MEMORY_ONLY) + assertCached(sql(s"select * from $db.cachedTable"), s"$db.cachedTable", MEMORY_ONLY) assert(spark.catalog.isCached(s"$db.cachedTable"), s"Table '$db.cachedTable' should be cached after refreshing with its qualified name.") @@ -382,7 +382,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // 'cachedTable', instead of '$db.cachedTable' activateDatabase(db) { sql("REFRESH TABLE cachedTable") - assertCached(sql("SELECT * FROM cachedTable"), s"`$db`.`cachedTable`", MEMORY_ONLY) + assertCached(sql("SELECT * FROM cachedTable"), s"$db.cachedTable", MEMORY_ONLY) assert(spark.catalog.isCached("cachedTable"), s"Table '$db.cachedTable' should be cached after refreshing with its " + "unqualified name.") @@ -403,13 +403,13 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // Cache the table 'cachedTable' in default db without qualified table name , and then // check whether the table is cached with expected name. sql("CACHE TABLE cachedTable OPTIONS('storageLevel' 'DISK_ONLY')") - assertCached(sql("SELECT * FROM cachedTable"), "`default`.`cachedTable`", DISK_ONLY) + assertCached(sql("SELECT * FROM cachedTable"), "cachedTable", DISK_ONLY) assert(spark.catalog.isCached("cachedTable"), "Table 'cachedTable' should be cached.") // Refresh the table 'cachedTable' in default db with unqualified table name, and then // check whether the table is still cached with the same name. sql("REFRESH TABLE cachedTable") - assertCached(sql("SELECT * FROM cachedTable"), "`default`.`cachedTable`", DISK_ONLY) + assertCached(sql("SELECT * FROM cachedTable"), "cachedTable", DISK_ONLY) assert(spark.catalog.isCached("cachedTable"), "Table 'cachedTable' should be cached after refreshing with its unqualified name.") @@ -421,7 +421,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto activateDatabase(db) { sql("REFRESH TABLE default.cachedTable") assertCached( - sql("SELECT * FROM default.cachedTable"), "`default`.`cachedTable`", DISK_ONLY) + sql("SELECT * FROM default.cachedTable"), "cachedTable", DISK_ONLY) assert(spark.catalog.isCached("default.cachedTable"), "Table 'cachedTable' should be cached after refreshing with its qualified name.") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala index f7c13ea047da7..a25c61c96f3d8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -596,7 +596,7 @@ private[hive] class TestHiveQueryExecution( override lazy val analyzed: LogicalPlan = sparkSession.withActive { val describedTables = logical match { - case CacheTableCommand(tbl, _, _, _) => tbl :: Nil + case CacheTableCommand(tbl, _, _, _) => tbl.asTableIdentifier :: Nil case _ => Nil } From 225c2e2815988ebf3e0926a4ca2af9a933b48467 Mon Sep 17 00:00:00 2001 From: "xuewei.linxuewei" Date: Mon, 30 Nov 2020 15:36:26 +0900 Subject: [PATCH 045/150] [SPARK-33498][SQL][FOLLOW-UP] Deduplicate the unittest by using checkCastWithParseError ### What changes were proposed in this pull request? Dup code removed in SPARK-33498 as follow-up. ### Why are the changes needed? Nit. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Existing UT. 
Closes #30540 from leanken/leanken-SPARK-33498. Authored-by: xuewei.linxuewei Signed-off-by: HyukjinKwon --- .../apache/spark/sql/catalyst/expressions/CastSuite.scala | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 0900a303b4cbe..d284c417042c1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -971,11 +971,7 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { checkCastWithParseError("20150318") checkCastWithParseError("2015-031-8") checkCastWithParseError("2015-03-18T12:03:17-0:70") - - val input = "abdef" - checkExceptionInExpression[DateTimeException]( - cast(input, TimestampType, Option(zid.getId)), - s"Cannot cast $input to TimestampType.") + checkCastWithParseError("abdef") } } } From b665d5881915f042930f502bcc3c6ee3cb00c50d Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Mon, 30 Nov 2020 17:04:38 +0900 Subject: [PATCH 046/150] [SPARK-28646][SQL] Fix bug of Count so as consistent with mainstream databases ### What changes were proposed in this pull request? Currently, Spark allows calls to `count` even for non parameterless aggregate function. For example, the following query actually works: `SELECT count() FROM tenk1;` On the other hand, mainstream databases will throw an error. **Oracle** `> ORA-00909: invalid number of arguments` **PgSQL** `ERROR: count(*) must be used to call a parameterless aggregate function` **MySQL** `> 1064 - You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near ')` ### Why are the changes needed? Fix a bug so that consistent with mainstream databases. There is an example query output with/without this fix. `SELECT count() FROM testData;` The output before this fix: `0` The output after this fix: ``` org.apache.spark.sql.AnalysisException cannot resolve 'count()' due to data type mismatch: count requires at least one argument.; line 1 pos 7 ``` ### Does this PR introduce _any_ user-facing change? Yes. If not specify parameter for `count`, will throw an error. ### How was this patch tested? Jenkins test. Closes #30541 from beliefer/SPARK-28646. 
Lead-authored-by: gengjiaan Co-authored-by: beliefer Signed-off-by: HyukjinKwon --- .../sql/catalyst/expressions/aggregate/Count.scala | 10 ++++++++++ .../src/test/resources/sql-tests/inputs/count.sql | 3 +++ .../test/resources/sql-tests/results/count.sql.out | 13 +++++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala index e043c81975066..e4488b26f197e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Count.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -43,11 +44,20 @@ import org.apache.spark.sql.types._ since = "1.0.0") // scalastyle:on line.size.limit case class Count(children: Seq[Expression]) extends DeclarativeAggregate { + override def nullable: Boolean = false // Return data type. override def dataType: DataType = LongType + override def checkInputDataTypes(): TypeCheckResult = { + if (children.isEmpty) { + TypeCheckResult.TypeCheckFailure(s"$prettyName requires at least one argument.") + } else { + TypeCheckResult.TypeCheckSuccess + } + } + protected lazy val count = AttributeReference("count", LongType, nullable = false)() override lazy val aggBufferAttributes = count :: Nil diff --git a/sql/core/src/test/resources/sql-tests/inputs/count.sql b/sql/core/src/test/resources/sql-tests/inputs/count.sql index 203f04c589373..fc0d66258ea29 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/count.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/count.sql @@ -35,3 +35,6 @@ SELECT count(DISTINCT a), count(DISTINCT 3,2) FROM testData; SELECT count(DISTINCT a), count(DISTINCT 2), count(DISTINCT 2,3) FROM testData; SELECT count(DISTINCT a), count(DISTINCT 2), count(DISTINCT 3,2) FROM testData; SELECT count(distinct 0.8), percentile_approx(distinct a, 0.8) FROM testData; + +-- count without expressions +SELECT count() FROM testData; diff --git a/sql/core/src/test/resources/sql-tests/results/count.sql.out b/sql/core/src/test/resources/sql-tests/results/count.sql.out index c0cdd0d697538..64614b5b67784 100644 --- a/sql/core/src/test/resources/sql-tests/results/count.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/count.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 14 -- !query @@ -116,4 +116,13 @@ SELECT count(distinct 0.8), percentile_approx(distinct a, 0.8) FROM testData -- !query schema struct -- !query output -1 2 \ No newline at end of file +1 2 + + +-- !query +SELECT count() FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'count()' due to data type mismatch: count requires at least one argument.; line 1 pos 7 \ No newline at end of file From 5cfbdddefe0753c3aff03f326b31c0ba8882b3a9 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 30 Nov 2020 09:23:05 +0000 Subject: [PATCH 047/150] [SPARK-33480][SQL] Support char/varchar type ### What changes were proposed in this pull request? 
This PR adds the char/varchar type, a variant of string type:
1. Char type is a fixed-length string. When comparing char type values, we need to pad the shorter one to the longer length.
2. Varchar type is a string with a length limitation.

To implement the char/varchar semantics, this PR:
1. Does a string length check when writing to char/varchar type columns.
2. Does string padding when reading char type columns. We don't do it at the writing side to save storage space.
3. Does string padding when comparing a char type column with a string literal or another char type column. (A string literal has a fixed length, so it should be treated as char type as well.)

To simplify the implementation, this PR doesn't propagate char/varchar type info through functions/operators (e.g. `substring`). That said, a column can only be char/varchar type if it's a table column, not a derived column like `SELECT substring(col)`.

To be safe, this PR doesn't add char/varchar type to the query engine (expression input check, internal row framework, codegen framework, etc.). We will replace char/varchar type by string type with metadata (`Attribute.metadata` or `StructField.metadata`) that includes the original type string before it goes into the query engine. That said, the existing code will not see char/varchar type but only string type.

char/varchar type may come from several places:
1. v1 table from hive catalog.
2. v2 table from v2 catalog.
3. user-specified schema in `spark.read.schema` and `spark.readStream.schema`
4. `Column.cast`
5. schema string in places like `from_json`, pandas UDF, etc.

These places use the SQL parser, which replaces char/varchar with string already, even before this PR. This PR covers all the above cases and implements the length check and padding feature by looking at string type with the special metadata.

### Why are the changes needed?

char and varchar are standard SQL types. varchar is widely used in other databases instead of string type.

### Does this PR introduce _any_ user-facing change?

For hive tables: now the table insertion fails if the value exceeds the char/varchar length. Previously we truncated the value silently.

For other tables:
1. now char type is allowed.
2. now we have a length check when inserting to varchar columns. Previously we wrote the value as it is.

### How was this patch tested?

New tests.

Closes #30412 from cloud-fan/char.
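To illustrate the intended semantics described above, here is a minimal SQL sketch. The table name and provider are hypothetical and the exact error messages may differ; this is not copied from the patch's test files.
```
CREATE TABLE t (c CHAR(5), v VARCHAR(3)) USING parquet;

-- Length check on write: fails because 'Spark' exceeds VARCHAR(3).
INSERT INTO t VALUES ('ab', 'Spark');

-- Padding on read: CHAR(5) values always come back with length 5.
INSERT INTO t VALUES ('ab', 'xy');
SELECT length(c), length(v) FROM t;   -- 5, 2

-- Padding on comparison: the literal 'ab' is padded to match CHAR(5).
SELECT c FROM t WHERE c = 'ab';       -- matches
```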
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- docs/sql-ref-datatypes.md | 2 + .../sql/catalyst/analysis/Analyzer.scala | 9 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 +- .../catalyst/analysis/ResolveCatalogs.scala | 5 - .../analysis/ResolvePartitionSpec.scala | 4 +- .../analysis/TableOutputResolver.scala | 19 +- .../sql/catalyst/catalog/SessionCatalog.scala | 7 +- .../sql/catalyst/parser/AstBuilder.scala | 17 +- .../catalyst/plans/logical/v2Commands.scala | 4 +- .../sql/catalyst/util/CharVarcharUtils.scala | 276 ++++++++++ .../sql/connector/catalog/CatalogV2Util.scala | 18 +- .../datasources/v2/DataSourceV2Relation.scala | 8 +- .../org/apache/spark/sql/types/CharType.scala | 38 ++ .../org/apache/spark/sql/types/DataType.scala | 10 +- .../spark/sql/types/HiveStringType.scala | 81 --- .../apache/spark/sql/types/VarcharType.scala | 37 ++ .../org/apache/spark/sql/types/package.scala | 10 +- .../sql/catalyst/analysis/AnalysisSuite.scala | 18 +- .../parser/TableSchemaParserSuite.scala | 15 +- .../spark/sql/connector/InMemoryTable.scala | 15 +- .../catalog/CatalogV2UtilSuite.scala | 2 +- .../scala/org/apache/spark/sql/Column.scala | 6 +- .../apache/spark/sql/DataFrameReader.scala | 4 +- .../analysis/ResolveSessionCatalog.scala | 37 +- .../datasources/ApplyCharTypePadding.scala | 135 +++++ .../datasources/LogicalRelation.scala | 18 +- .../datasources/jdbc/JdbcUtils.scala | 19 +- .../datasources/v2/PushDownUtils.scala | 4 +- .../internal/BaseSessionStateBuilder.scala | 1 + .../sql/streaming/DataStreamReader.scala | 4 +- .../spark/sql/CharVarcharTestSuite.scala | 505 ++++++++++++++++++ .../command/PlanResolutionSuite.scala | 44 +- .../spark/sql/sources/TableScanSuite.scala | 14 +- .../sql/hive/HiveSessionStateBuilder.scala | 1 + .../sql/hive/client/HiveClientImpl.scala | 19 +- .../spark/sql/HiveCharVarcharTestSuite.scala | 43 ++ .../sql/hive/HiveMetastoreCatalogSuite.scala | 15 +- .../sql/hive/execution/HiveDDLSuite.scala | 4 +- 38 files changed, 1172 insertions(+), 302 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/CharType.scala delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/HiveStringType.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala diff --git a/docs/sql-ref-datatypes.md b/docs/sql-ref-datatypes.md index f27f1a0ca967f..0087867a8c7f7 100644 --- a/docs/sql-ref-datatypes.md +++ b/docs/sql-ref-datatypes.md @@ -37,6 +37,8 @@ Spark SQL and DataFrames support the following data types: - `DecimalType`: Represents arbitrary-precision signed decimal numbers. Backed internally by `java.math.BigDecimal`. A `BigDecimal` consists of an arbitrary precision integer unscaled value and a 32-bit integer scale. * String type - `StringType`: Represents character string values. + - `VarcharType(length)`: A variant of `StringType` which has a length limitation. Data writing will fail if the input string exceeds the length limitation. Note: this type can only be used in table schema, not functions/operators. 
+ - `CharType(length)`: A variant of `VarcharType(length)` which is fixed length. Reading column of type `CharType(n)` always returns string values of length `n`. Char type column comparison will pad the short one to the longer length. * Binary type - `BinaryType`: Represents byte sequence values. * Boolean type diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 9b599b4c8f8d4..23a1b7bdde93c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 import org.apache.spark.sql.catalyst.trees.TreeNodeRef -import org.apache.spark.sql.catalyst.util.toPrettySQL +import org.apache.spark.sql.catalyst.util.{toPrettySQL, CharVarcharUtils} import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, After, ColumnChange, ColumnPosition, DeleteColumn, RenameColumn, UpdateColumnComment, UpdateColumnNullability, UpdateColumnPosition, UpdateColumnType} @@ -3102,7 +3102,12 @@ class Analyzer(override val catalogManager: CatalogManager) val projection = TableOutputResolver.resolveOutputColumns( v2Write.table.name, v2Write.table.output, v2Write.query, v2Write.isByName, conf) if (projection != v2Write.query) { - v2Write.withNewQuery(projection) + val cleanedTable = v2Write.table match { + case r: DataSourceV2Relation => + r.copy(output = r.output.map(CharVarcharUtils.cleanAttrMetadata)) + case other => other + } + v2Write.withNewQuery(projection).withNewTable(cleanedTable) } else { v2Write } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 7f89c130749f4..2818ba58075cd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.TypeUtils +import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, TypeUtils} import org.apache.spark.sql.connector.catalog.{SupportsAtomicPartitionManagement, SupportsPartitionManagement, Table} import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, After, ColumnPosition, DeleteColumn, RenameColumn, UpdateColumnComment, UpdateColumnNullability, UpdateColumnPosition, UpdateColumnType} import org.apache.spark.sql.internal.SQLConf @@ -94,6 +94,10 @@ trait CheckAnalysis extends PredicateHelper { case p if p.analyzed => // Skip already analyzed sub-plans + case leaf: LeafNode if leaf.output.map(_.dataType).exists(CharVarcharUtils.hasCharVarchar) => + throw new IllegalStateException( + "[BUG] logical plan should not have output of char/varchar type: " + leaf) + case u: UnresolvedNamespace => u.failAnalysis(s"Namespace not found: 
${u.multipartIdentifier.quoted}") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index 7354d2478b7c8..a90de697bc084 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -35,7 +35,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) case AlterTableAddColumnsStatement( nameParts @ NonSessionCatalogAndTable(catalog, tbl), cols) => cols.foreach(c => failNullType(c.dataType)) - cols.foreach(c => failCharType(c.dataType)) val changes = cols.map { col => TableChange.addColumn( col.name.toArray, @@ -49,7 +48,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) case AlterTableReplaceColumnsStatement( nameParts @ NonSessionCatalogAndTable(catalog, tbl), cols) => cols.foreach(c => failNullType(c.dataType)) - cols.foreach(c => failCharType(c.dataType)) val changes: Seq[TableChange] = loadTable(catalog, tbl.asIdentifier) match { case Some(table) => // REPLACE COLUMNS deletes all the existing columns and adds new columns specified. @@ -72,7 +70,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) case a @ AlterTableAlterColumnStatement( nameParts @ NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => a.dataType.foreach(failNullType) - a.dataType.foreach(failCharType) val colName = a.column.toArray val typeChange = a.dataType.map { newDataType => TableChange.updateColumnType(colName, newDataType) @@ -145,7 +142,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) case c @ CreateTableStatement( NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) - assertNoCharTypeInSchema(c.tableSchema) CreateV2Table( catalog.asTableCatalog, tbl.asIdentifier, @@ -173,7 +169,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) case c @ ReplaceTableStatement( NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) - assertNoCharTypeInSchema(c.tableSchema) ReplaceTable( catalog.asTableCatalog, tbl.asIdentifier, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala index 6d061fce06919..98c6872a47cc6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.catalyst.plans.logical.{AlterTableAddPartition, AlterTableDropPartition, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.catalog.SupportsPartitionManagement import org.apache.spark.sql.types._ import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec @@ -66,7 +67,8 @@ object ResolvePartitionSpec extends Rule[LogicalPlan] { val partValues = partSchema.map { part => val raw = normalizedSpec.get(part.name).orNull - Cast(Literal.create(raw, StringType), part.dataType, Some(conf.sessionLocalTimeZone)).eval() + val 
dt = CharVarcharUtils.replaceCharVarcharWithString(part.dataType) + Cast(Literal.create(raw, StringType), dt, Some(conf.sessionLocalTimeZone)).eval() } InternalRow.fromSeq(partValues) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala index 4f33ca99c02db..d5c407b47c5be 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala @@ -22,6 +22,7 @@ import scala.collection.mutable import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Alias, AnsiCast, Attribute, Cast, NamedExpression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.types.DataType @@ -93,19 +94,17 @@ object TableOutputResolver { tableAttr.metadata == queryExpr.metadata) { Some(queryExpr) } else { - // Renaming is needed for handling the following cases like - // 1) Column names/types do not match, e.g., INSERT INTO TABLE tab1 SELECT 1, 2 - // 2) Target tables have column metadata - storeAssignmentPolicy match { + val casted = storeAssignmentPolicy match { case StoreAssignmentPolicy.ANSI => - Some(Alias( - AnsiCast(queryExpr, tableAttr.dataType, Option(conf.sessionLocalTimeZone)), - tableAttr.name)(explicitMetadata = Option(tableAttr.metadata))) + AnsiCast(queryExpr, tableAttr.dataType, Option(conf.sessionLocalTimeZone)) case _ => - Some(Alias( - Cast(queryExpr, tableAttr.dataType, Option(conf.sessionLocalTimeZone)), - tableAttr.name)(explicitMetadata = Option(tableAttr.metadata))) + Cast(queryExpr, tableAttr.dataType, Option(conf.sessionLocalTimeZone)) } + val exprWithStrLenCheck = CharVarcharUtils.stringLengthCheck(casted, tableAttr) + // Renaming is needed for handling the following cases like + // 1) Column names/types do not match, e.g., INSERT INTO TABLE tab1 SELECT 1, 2 + // 2) Target tables have column metadata + Some(Alias(exprWithStrLenCheck, tableAttr.name)(explicitMetadata = Some(tableAttr.metadata))) } storeAssignmentPolicy match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 5122ca7521d9a..01bce079610ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, ImplicitCastInputTypes} import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, View} -import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, StringUtils} import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE @@ -473,7 +473,10 @@ class SessionCatalog( val table = 
formatTableName(name.table) requireDbExists(db) requireTableExists(TableIdentifier(table, Some(db))) - externalCatalog.getTable(db, table) + val t = externalCatalog.getTable(db, table) + // We replace char/varchar with "annotated" string type in the table schema, as the query + // engine doesn't support char/varchar yet. + t.copy(schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(t.schema)) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index a31d7ca7268a6..ce95ea4b41def 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -36,8 +36,8 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.{First, Last} import org.apache.spark.sql.catalyst.parser.SqlBaseParser._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, IntervalUtils} import org.apache.spark.sql.catalyst.util.DateTimeUtils.{getZoneId, stringToDate, stringToTimestamp} -import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.catalyst.util.IntervalUtils.IntervalUnit import org.apache.spark.sql.connector.catalog.{SupportsNamespaces, TableCatalog} import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition @@ -99,7 +99,9 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } override def visitSingleTableSchema(ctx: SingleTableSchemaContext): StructType = { - withOrigin(ctx)(StructType(visitColTypeList(ctx.colTypeList))) + val schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema( + StructType(visitColTypeList(ctx.colTypeList))) + withOrigin(ctx)(schema) } def parseRawDataType(ctx: SingleDataTypeContext): DataType = withOrigin(ctx) { @@ -2226,7 +2228,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * Create a Spark DataType. */ private def visitSparkDataType(ctx: DataTypeContext): DataType = { - HiveStringType.replaceCharType(typedVisit(ctx)) + CharVarcharUtils.replaceCharVarcharWithString(typedVisit(ctx)) } /** @@ -2301,16 +2303,9 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg builder.putString("comment", _) } - // Add Hive type string to metadata. 
- val rawDataType = typedVisit[DataType](ctx.dataType) - val cleanedDataType = HiveStringType.replaceCharType(rawDataType) - if (rawDataType != cleanedDataType) { - builder.putString(HIVE_TYPE_STRING, rawDataType.catalogString) - } - StructField( name = colName.getText, - dataType = cleanedDataType, + dataType = typedVisit[DataType](ctx.dataType), nullable = NULL == null, metadata = builder.build()) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index ebf41f6a6e304..4931f0eb2c007 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.analysis.{NamedRelation, PartitionSpec, Res import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, Expression, Unevaluable} import org.apache.spark.sql.catalyst.plans.DescribeCommandSchema +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, ColumnChange} import org.apache.spark.sql.connector.expressions.Transform @@ -45,9 +46,10 @@ trait V2WriteCommand extends Command { table.skipSchemaResolution || (query.output.size == table.output.size && query.output.zip(table.output).forall { case (inAttr, outAttr) => + val outType = CharVarcharUtils.getRawType(outAttr.metadata).getOrElse(outAttr.dataType) // names and types must match, nullability must be compatible inAttr.name == outAttr.name && - DataType.equalsIgnoreCompatibleNullability(inAttr.dataType, outAttr.dataType) && + DataType.equalsIgnoreCompatibleNullability(inAttr.dataType, outType) && (outAttr.nullable || !inAttr.nullable) }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala new file mode 100644 index 0000000000000..0cbe5abdbbd7a --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.util + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.types._ + +object CharVarcharUtils { + + private val CHAR_VARCHAR_TYPE_STRING_METADATA_KEY = "__CHAR_VARCHAR_TYPE_STRING" + + /** + * Replaces CharType/VarcharType with StringType recursively in the given struct type. If a + * top-level StructField's data type is CharType/VarcharType or has nested CharType/VarcharType, + * this method will add the original type string to the StructField's metadata, so that we can + * re-construct the original data type with CharType/VarcharType later when needed. + */ + def replaceCharVarcharWithStringInSchema(st: StructType): StructType = { + StructType(st.map { field => + if (hasCharVarchar(field.dataType)) { + val metadata = new MetadataBuilder().withMetadata(field.metadata) + .putString(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY, field.dataType.sql).build() + field.copy(dataType = replaceCharVarcharWithString(field.dataType), metadata = metadata) + } else { + field + } + }) + } + + /** + * Returns true if the given data type is CharType/VarcharType or has nested CharType/VarcharType. + */ + def hasCharVarchar(dt: DataType): Boolean = { + dt.existsRecursively(f => f.isInstanceOf[CharType] || f.isInstanceOf[VarcharType]) + } + + /** + * Replaces CharType/VarcharType with StringType recursively in the given data type. + */ + def replaceCharVarcharWithString(dt: DataType): DataType = dt match { + case ArrayType(et, nullable) => + ArrayType(replaceCharVarcharWithString(et), nullable) + case MapType(kt, vt, nullable) => + MapType(replaceCharVarcharWithString(kt), replaceCharVarcharWithString(vt), nullable) + case StructType(fields) => + StructType(fields.map { field => + field.copy(dataType = replaceCharVarcharWithString(field.dataType)) + }) + case _: CharType => StringType + case _: VarcharType => StringType + case _ => dt + } + + /** + * Removes the metadata entry that contains the original type string of CharType/VarcharType from + * the given attribute's metadata. + */ + def cleanAttrMetadata(attr: AttributeReference): AttributeReference = { + val cleaned = new MetadataBuilder().withMetadata(attr.metadata) + .remove(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY).build() + attr.withMetadata(cleaned) + } + + /** + * Re-construct the original data type from the type string in the given metadata. + * This is needed when dealing with char/varchar columns/fields. + */ + def getRawType(metadata: Metadata): Option[DataType] = { + if (metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY)) { + Some(CatalystSqlParser.parseRawDataType( + metadata.getString(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY))) + } else { + None + } + } + + /** + * Returns expressions to apply read-side char type padding for the given attributes. String + * values should be right-padded to N characters if it's from a CHAR(N) column/field. 
+ */ + def charTypePadding(output: Seq[AttributeReference]): Seq[NamedExpression] = { + output.map { attr => + getRawType(attr.metadata).filter { rawType => + rawType.existsRecursively(_.isInstanceOf[CharType]) + }.map { rawType => + Alias(charTypePadding(attr, rawType), attr.name)(explicitMetadata = Some(attr.metadata)) + }.getOrElse(attr) + } + } + + private def charTypePadding(expr: Expression, dt: DataType): Expression = dt match { + case CharType(length) => StringRPad(expr, Literal(length)) + + case StructType(fields) => + val struct = CreateNamedStruct(fields.zipWithIndex.flatMap { case (f, i) => + Seq(Literal(f.name), charTypePadding(GetStructField(expr, i, Some(f.name)), f.dataType)) + }) + if (expr.nullable) { + If(IsNull(expr), Literal(null, struct.dataType), struct) + } else { + struct + } + + case ArrayType(et, containsNull) => charTypePaddingInArray(expr, et, containsNull) + + case MapType(kt, vt, valueContainsNull) => + val newKeys = charTypePaddingInArray(MapKeys(expr), kt, containsNull = false) + val newValues = charTypePaddingInArray(MapValues(expr), vt, valueContainsNull) + MapFromArrays(newKeys, newValues) + + case _ => expr + } + + private def charTypePaddingInArray( + arr: Expression, et: DataType, containsNull: Boolean): Expression = { + val param = NamedLambdaVariable("x", replaceCharVarcharWithString(et), containsNull) + val func = LambdaFunction(charTypePadding(param, et), Seq(param)) + ArrayTransform(arr, func) + } + + /** + * Returns an expression to apply write-side string length check for the given expression. A + * string value can not exceed N characters if it's written into a CHAR(N)/VARCHAR(N) + * column/field. + */ + def stringLengthCheck(expr: Expression, targetAttr: Attribute): Expression = { + getRawType(targetAttr.metadata).map { rawType => + stringLengthCheck(expr, rawType) + }.getOrElse(expr) + } + + private def raiseError(expr: Expression, typeName: String, length: Int): Expression = { + val errorMsg = Concat(Seq( + Literal("input string '"), + expr, + Literal(s"' exceeds $typeName type length limitation: $length"))) + Cast(RaiseError(errorMsg), StringType) + } + + private def stringLengthCheck(expr: Expression, dt: DataType): Expression = dt match { + case CharType(length) => + val trimmed = StringTrimRight(expr) + // Trailing spaces do not count in the length check. We don't need to retain the trailing + // spaces, as we will pad char type columns/fields at read time. + If( + GreaterThan(Length(trimmed), Literal(length)), + raiseError(expr, "char", length), + trimmed) + + case VarcharType(length) => + val trimmed = StringTrimRight(expr) + // Trailing spaces do not count in the length check. We need to retain the trailing spaces + // (truncate to length N), as there is no read-time padding for varchar type. + // TODO: create a special TrimRight function that can trim to a certain length. 
+ If( + LessThanOrEqual(Length(expr), Literal(length)), + expr, + If( + GreaterThan(Length(trimmed), Literal(length)), + raiseError(expr, "varchar", length), + StringRPad(trimmed, Literal(length)))) + + case StructType(fields) => + val struct = CreateNamedStruct(fields.zipWithIndex.flatMap { case (f, i) => + Seq(Literal(f.name), stringLengthCheck(GetStructField(expr, i, Some(f.name)), f.dataType)) + }) + if (expr.nullable) { + If(IsNull(expr), Literal(null, struct.dataType), struct) + } else { + struct + } + + case ArrayType(et, containsNull) => stringLengthCheckInArray(expr, et, containsNull) + + case MapType(kt, vt, valueContainsNull) => + val newKeys = stringLengthCheckInArray(MapKeys(expr), kt, containsNull = false) + val newValues = stringLengthCheckInArray(MapValues(expr), vt, valueContainsNull) + MapFromArrays(newKeys, newValues) + + case _ => expr + } + + private def stringLengthCheckInArray( + arr: Expression, et: DataType, containsNull: Boolean): Expression = { + val param = NamedLambdaVariable("x", replaceCharVarcharWithString(et), containsNull) + val func = LambdaFunction(stringLengthCheck(param, et), Seq(param)) + ArrayTransform(arr, func) + } + + /** + * Return expressions to apply char type padding for the string comparison between the given + * attributes. When comparing two char type columns/fields, we need to pad the shorter one to + * the longer length. + */ + def addPaddingInStringComparison(attrs: Seq[Attribute]): Seq[Expression] = { + val rawTypes = attrs.map(attr => getRawType(attr.metadata)) + if (rawTypes.exists(_.isEmpty)) { + attrs + } else { + val typeWithTargetCharLength = rawTypes.map(_.get).reduce(typeWithWiderCharLength) + attrs.zip(rawTypes.map(_.get)).map { case (attr, rawType) => + padCharToTargetLength(attr, rawType, typeWithTargetCharLength).getOrElse(attr) + } + } + } + + private def typeWithWiderCharLength(type1: DataType, type2: DataType): DataType = { + (type1, type2) match { + case (CharType(len1), CharType(len2)) => + CharType(math.max(len1, len2)) + case (StructType(fields1), StructType(fields2)) => + assert(fields1.length == fields2.length) + StructType(fields1.zip(fields2).map { case (left, right) => + StructField("", typeWithWiderCharLength(left.dataType, right.dataType)) + }) + case (ArrayType(et1, _), ArrayType(et2, _)) => + ArrayType(typeWithWiderCharLength(et1, et2)) + case _ => NullType + } + } + + private def padCharToTargetLength( + expr: Expression, + rawType: DataType, + typeWithTargetCharLength: DataType): Option[Expression] = { + (rawType, typeWithTargetCharLength) match { + case (CharType(len), CharType(target)) if target > len => + Some(StringRPad(expr, Literal(target))) + + case (StructType(fields), StructType(targets)) => + assert(fields.length == targets.length) + var i = 0 + var needPadding = false + val createStructExprs = mutable.ArrayBuffer.empty[Expression] + while (i < fields.length) { + val field = fields(i) + val fieldExpr = GetStructField(expr, i, Some(field.name)) + val padded = padCharToTargetLength(fieldExpr, field.dataType, targets(i).dataType) + needPadding = padded.isDefined + createStructExprs += Literal(field.name) + createStructExprs += padded.getOrElse(fieldExpr) + i += 1 + } + if (needPadding) Some(CreateNamedStruct(createStructExprs.toSeq)) else None + + case (ArrayType(et, containsNull), ArrayType(target, _)) => + val param = NamedLambdaVariable("x", replaceCharVarcharWithString(et), containsNull) + padCharToTargetLength(param, et, target).map { padded => + val func = LambdaFunction(padded, Seq(param)) + 
ArrayTransform(expr, func) + } + + // We don't handle MapType here as it's not comparable. + + case _ => None + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala index b6dc4f61c8588..02db2293ec64a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogV2Util.scala @@ -24,11 +24,10 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{NamedRelation, NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException, UnresolvedV2Relation} -import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, CreateTableAsSelectStatement, CreateTableStatement, ReplaceTableAsSelectStatement, ReplaceTableStatement, SerdeInfo} import org.apache.spark.sql.connector.catalog.TableChange._ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.types.{ArrayType, DataType, HIVE_TYPE_STRING, HiveStringType, MapType, NullType, StructField, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, NullType, StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.Utils @@ -379,21 +378,6 @@ private[sql] object CatalogV2Util { .asTableCatalog } - def failCharType(dt: DataType): Unit = { - if (HiveStringType.containsCharType(dt)) { - throw new AnalysisException( - "Cannot use CHAR type in non-Hive-Serde tables, please use STRING type instead.") - } - } - - def assertNoCharTypeInSchema(schema: StructType): Unit = { - schema.foreach { f => - if (f.metadata.contains(HIVE_TYPE_STRING)) { - failCharType(CatalystSqlParser.parseRawDataType(f.metadata.getString(HIVE_TYPE_STRING))) - } - } - } - def failNullType(dt: DataType): Unit = { def containsNullType(dt: DataType): Boolean = dt match { case ArrayType(et, _) => containsNullType(et) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index f541411daeff4..4debdd380e6b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} -import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.catalyst.util.{truncatedString, CharVarcharUtils} import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, MetadataColumn, SupportsMetadataColumns, Table, TableCapability} import org.apache.spark.sql.connector.read.{Scan, Statistics => V2Statistics, SupportsReportStatistics} import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream} @@ -171,8 +171,10 @@ object DataSourceV2Relation { catalog: Option[CatalogPlugin], identifier: Option[Identifier], 
options: CaseInsensitiveStringMap): DataSourceV2Relation = { - val output = table.schema().toAttributes - DataSourceV2Relation(table, output, catalog, identifier, options) + // The v2 source may return schema containing char/varchar type. We replace char/varchar + // with "annotated" string type here as the query engine doesn't support char/varchar yet. + val schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(table.schema) + DataSourceV2Relation(table, schema.toAttributes, catalog, identifier, options) } def create( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CharType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CharType.scala new file mode 100644 index 0000000000000..67ab1cc2f3321 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CharType.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.types + +import scala.math.Ordering +import scala.reflect.runtime.universe.typeTag + +import org.apache.spark.annotation.Experimental +import org.apache.spark.unsafe.types.UTF8String + +@Experimental +case class CharType(length: Int) extends AtomicType { + require(length >= 0, "The length of char type cannot be negative.") + + private[sql] type InternalType = UTF8String + @transient private[sql] lazy val tag = typeTag[InternalType] + private[sql] val ordering = implicitly[Ordering[InternalType]] + + override def defaultSize: Int = length + override def typeName: String = s"char($length)" + override def toString: String = s"CharType($length)" + private[spark] override def asNullable: CharType = this +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index 7556a19f0d316..e4ee6eb377a4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -124,13 +124,15 @@ abstract class DataType extends AbstractDataType { object DataType { private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r + private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r + private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r def fromDDL(ddl: String): DataType = { parseTypeWithFallback( ddl, CatalystSqlParser.parseDataType, "Cannot parse the data type: ", - fallbackParser = CatalystSqlParser.parseTableSchema) + fallbackParser = str => CatalystSqlParser.parseTableSchema(str)) } /** @@ -166,7 +168,7 @@ object DataType { def fromJson(json: String): DataType = parseDataType(parse(json)) - private val nonDecimalNameToType = { + private val otherTypes = { Seq(NullType, DateType, TimestampType, BinaryType, IntegerType, BooleanType, LongType, 
DoubleType, FloatType, ShortType, ByteType, StringType, CalendarIntervalType) .map(t => t.typeName -> t).toMap @@ -177,7 +179,9 @@ object DataType { name match { case "decimal" => DecimalType.USER_DEFAULT case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt) - case other => nonDecimalNameToType.getOrElse( + case CHAR_TYPE(length) => CharType(length.toInt) + case VARCHAR_TYPE(length) => VarcharType(length.toInt) + case other => otherTypes.getOrElse( other, throw new IllegalArgumentException( s"Failed to convert the JSON string '$name' to a data type.")) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/HiveStringType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/HiveStringType.scala deleted file mode 100644 index a29f49ad14a77..0000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/HiveStringType.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.types - -import scala.math.Ordering -import scala.reflect.runtime.universe.typeTag - -import org.apache.spark.unsafe.types.UTF8String - -/** - * A hive string type for compatibility. These datatypes should only used for parsing, - * and should NOT be used anywhere else. Any instance of these data types should be - * replaced by a [[StringType]] before analysis. - */ -sealed abstract class HiveStringType extends AtomicType { - private[sql] type InternalType = UTF8String - - private[sql] val ordering = implicitly[Ordering[InternalType]] - - @transient private[sql] lazy val tag = typeTag[InternalType] - - override def defaultSize: Int = length - - private[spark] override def asNullable: HiveStringType = this - - def length: Int -} - -object HiveStringType { - def replaceCharType(dt: DataType): DataType = dt match { - case ArrayType(et, nullable) => - ArrayType(replaceCharType(et), nullable) - case MapType(kt, vt, nullable) => - MapType(replaceCharType(kt), replaceCharType(vt), nullable) - case StructType(fields) => - StructType(fields.map { field => - field.copy(dataType = replaceCharType(field.dataType)) - }) - case _: HiveStringType => StringType - case _ => dt - } - - def containsCharType(dt: DataType): Boolean = dt match { - case ArrayType(et, _) => containsCharType(et) - case MapType(kt, vt, _) => containsCharType(kt) || containsCharType(vt) - case StructType(fields) => fields.exists(f => containsCharType(f.dataType)) - case _ => dt.isInstanceOf[CharType] - } -} - -/** - * Hive char type. Similar to other HiveStringType's, these datatypes should only used for - * parsing, and should NOT be used anywhere else. Any instance of these data types should be - * replaced by a [[StringType]] before analysis. 
- */ -case class CharType(length: Int) extends HiveStringType { - override def simpleString: String = s"char($length)" -} - -/** - * Hive varchar type. Similar to other HiveStringType's, these datatypes should only used for - * parsing, and should NOT be used anywhere else. Any instance of these data types should be - * replaced by a [[StringType]] before analysis. - */ -case class VarcharType(length: Int) extends HiveStringType { - override def simpleString: String = s"varchar($length)" -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala new file mode 100644 index 0000000000000..8d78640c1e125 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.types + +import scala.math.Ordering +import scala.reflect.runtime.universe.typeTag + +import org.apache.spark.annotation.Experimental +import org.apache.spark.unsafe.types.UTF8String + +@Experimental +case class VarcharType(length: Int) extends AtomicType { + require(length >= 0, "The length of varchar type cannot be negative.") + + private[sql] type InternalType = UTF8String + @transient private[sql] lazy val tag = typeTag[InternalType] + private[sql] val ordering = implicitly[Ordering[InternalType]] + + override def defaultSize: Int = length + override def typeName: String = s"varchar($length)" + override def toString: String = s"CharType($length)" + private[spark] override def asNullable: VarcharType = this +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala index f29cbc2069e39..346a51ea10c82 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala @@ -21,12 +21,4 @@ package org.apache.spark.sql * Contains a type system for attributes produced by relations, including complex types like * structs, arrays and maps. */ -package object types { - /** - * Metadata key used to store the raw hive type string in the metadata of StructField. This - * is relevant for datatypes that do not have a direct Spark SQL counterpart, such as CHAR and - * VARCHAR. We need to preserve the original type in order to invoke the correct object - * inspector in Hive. 
- */ - val HIVE_TYPE_STRING = "HIVE_TYPE_STRING" -} +package object types diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index f0a24d4a56048..0afa811e5d590 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import java.util.TimeZone +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -41,9 +42,11 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.connector.InMemoryTable +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ - +import org.apache.spark.sql.util.CaseInsensitiveStringMap class AnalysisSuite extends AnalysisTest with Matchers { import org.apache.spark.sql.catalyst.analysis.TestRelations._ @@ -55,6 +58,19 @@ class AnalysisSuite extends AnalysisTest with Matchers { } } + test("fail if a leaf node has char/varchar type output") { + val schema1 = new StructType().add("c", CharType(5)) + val schema2 = new StructType().add("c", VarcharType(5)) + val schema3 = new StructType().add("c", ArrayType(CharType(5))) + Seq(schema1, schema2, schema3).foreach { schema => + val table = new InMemoryTable("t", schema, Array.empty, Map.empty[String, String].asJava) + intercept[IllegalStateException] { + DataSourceV2Relation( + table, schema.toAttributes, None, None, CaseInsensitiveStringMap.empty()).analyze + } + } + } + test("union project *") { val plan = (1 to 120) .map(_ => testRelation) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala index 6803fc307f919..95851d44b4747 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.types._ class TableSchemaParserSuite extends SparkFunSuite { @@ -57,11 +58,6 @@ class TableSchemaParserSuite extends SparkFunSuite { |anotherArray:Array> """.stripMargin.replace("\n", "") - val builder = new MetadataBuilder - builder.putString(HIVE_TYPE_STRING, - "struct," + - "MAP:map,arrAy:array,anotherArray:array>") - val expectedDataType = StructType( StructField("complexStructCol", StructType( @@ -69,13 +65,12 @@ class TableSchemaParserSuite extends SparkFunSuite { StructType( StructField("deciMal", DecimalType.USER_DEFAULT) :: StructField("anotherDecimal", DecimalType(5, 2)) :: Nil)) :: - StructField("MAP", MapType(TimestampType, StringType)) :: + StructField("MAP", MapType(TimestampType, VarcharType(10))) :: StructField("arrAy", ArrayType(DoubleType)) :: - StructField("anotherArray", ArrayType(StringType)) :: Nil), - 
nullable = true, - builder.build()) :: Nil) + StructField("anotherArray", ArrayType(CharType(9))) :: Nil)) :: Nil) - assert(parse(tableSchemaString) === expectedDataType) + assert(parse(tableSchemaString) === + CharVarcharUtils.replaceCharVarcharWithStringInSchema(expectedDataType)) } // Negative cases diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala index ffff00b54f1b8..cfb044b428e41 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala @@ -28,7 +28,7 @@ import org.scalatest.Assertions._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, JoinedRow} -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, DateTimeUtils} import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.expressions.{BucketTransform, DaysTransform, HoursTransform, IdentityTransform, MonthsTransform, Transform, YearsTransform} import org.apache.spark.sql.connector.read._ @@ -116,11 +116,12 @@ class InMemoryTable( } } + val cleanedSchema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(schema) partitioning.map { case IdentityTransform(ref) => - extractor(ref.fieldNames, schema, row)._1 + extractor(ref.fieldNames, cleanedSchema, row)._1 case YearsTransform(ref) => - extractor(ref.fieldNames, schema, row) match { + extractor(ref.fieldNames, cleanedSchema, row) match { case (days: Int, DateType) => ChronoUnit.YEARS.between(EPOCH_LOCAL_DATE, DateTimeUtils.daysToLocalDate(days)) case (micros: Long, TimestampType) => @@ -130,7 +131,7 @@ class InMemoryTable( throw new IllegalArgumentException(s"Match: unsupported argument(s) type - ($v, $t)") } case MonthsTransform(ref) => - extractor(ref.fieldNames, schema, row) match { + extractor(ref.fieldNames, cleanedSchema, row) match { case (days: Int, DateType) => ChronoUnit.MONTHS.between(EPOCH_LOCAL_DATE, DateTimeUtils.daysToLocalDate(days)) case (micros: Long, TimestampType) => @@ -140,7 +141,7 @@ class InMemoryTable( throw new IllegalArgumentException(s"Match: unsupported argument(s) type - ($v, $t)") } case DaysTransform(ref) => - extractor(ref.fieldNames, schema, row) match { + extractor(ref.fieldNames, cleanedSchema, row) match { case (days, DateType) => days case (micros: Long, TimestampType) => @@ -149,14 +150,14 @@ class InMemoryTable( throw new IllegalArgumentException(s"Match: unsupported argument(s) type - ($v, $t)") } case HoursTransform(ref) => - extractor(ref.fieldNames, schema, row) match { + extractor(ref.fieldNames, cleanedSchema, row) match { case (micros: Long, TimestampType) => ChronoUnit.HOURS.between(Instant.EPOCH, DateTimeUtils.microsToInstant(micros)) case (v, t) => throw new IllegalArgumentException(s"Match: unsupported argument(s) type - ($v, $t)") } case BucketTransform(numBuckets, ref) => - val (value, dataType) = extractor(ref.fieldNames, schema, row) + val (value, dataType) = extractor(ref.fieldNames, cleanedSchema, row) val valueHashCode = if (value == null) 0 else value.hashCode ((valueHashCode + 31 * dataType.hashCode()) & Integer.MAX_VALUE) % numBuckets } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogV2UtilSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogV2UtilSuite.scala index 7a9a7f52ff8fd..da5cfab8be3c7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogV2UtilSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogV2UtilSuite.scala @@ -28,7 +28,7 @@ class CatalogV2UtilSuite extends SparkFunSuite { val testCatalog = mock(classOf[TableCatalog]) val ident = mock(classOf[Identifier]) val table = mock(classOf[Table]) - when(table.schema()).thenReturn(mock(classOf[StructType])) + when(table.schema()).thenReturn(new StructType().add("i", "int")) when(testCatalog.loadTable(ident)).thenReturn(table) val r = CatalogV2Util.loadRelation(testCatalog, ident) assert(r.isDefined) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index c164835c753e8..b3e403ffa7382 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.util.toPrettySQL +import org.apache.spark.sql.catalyst.util.{toPrettySQL, CharVarcharUtils} import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.lit @@ -1181,7 +1181,9 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 1.3.0 */ - def cast(to: DataType): Column = withExpr { Cast(expr, to) } + def cast(to: DataType): Column = withExpr { + Cast(expr, CharVarcharUtils.replaceCharVarcharWithString(to)) + } /** * Casts the column to a different data type, using the canonical string representation diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 8f96f0b882424..007df183ee353 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.csv.{CSVHeaderChecker, CSVOptions, UnivocityParser} import org.apache.spark.sql.catalyst.expressions.ExprUtils import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JSONOptions} -import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, FailureSafeParser} +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils, FailureSafeParser} import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsCatalogOptions, SupportsRead} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils @@ -73,7 +73,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * @since 1.4.0 */ def schema(schema: StructType): DataFrameReader = { - this.userSpecifiedSchema = Option(schema) + this.userSpecifiedSchema = Option(CharVarcharUtils.replaceCharVarcharWithStringInSchema(schema)) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 582f11a2be8fa..53edd4fca7794 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 import org.apache.spark.sql.internal.HiveSerDe -import org.apache.spark.sql.types.{HIVE_TYPE_STRING, HiveStringType, MetadataBuilder, StructField, StructType} +import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} /** * Resolves catalogs from the multi-part identifiers in SQL statements, and convert the statements @@ -51,9 +51,6 @@ class ResolveSessionCatalog( cols.foreach(c => failNullType(c.dataType)) loadTable(catalog, tbl.asIdentifier).collect { case v1Table: V1Table => - if (!DDLUtils.isHiveTable(v1Table.v1Table)) { - cols.foreach(c => failCharType(c.dataType)) - } cols.foreach { c => assertTopLevelColumn(c.name, "AlterTableAddColumnsCommand") if (!c.nullable) { @@ -63,7 +60,6 @@ class ResolveSessionCatalog( } AlterTableAddColumnsCommand(tbl.asTableIdentifier, cols.map(convertToStructField)) }.getOrElse { - cols.foreach(c => failCharType(c.dataType)) val changes = cols.map { col => TableChange.addColumn( col.name.toArray, @@ -82,7 +78,6 @@ class ResolveSessionCatalog( case Some(_: V1Table) => throw new AnalysisException("REPLACE COLUMNS is only supported with v2 tables.") case Some(table) => - cols.foreach(c => failCharType(c.dataType)) // REPLACE COLUMNS deletes all the existing columns and adds new columns specified. val deleteChanges = table.schema.fieldNames.map { name => TableChange.deleteColumn(Array(name)) @@ -105,10 +100,6 @@ class ResolveSessionCatalog( a.dataType.foreach(failNullType) loadTable(catalog, tbl.asIdentifier).collect { case v1Table: V1Table => - if (!DDLUtils.isHiveTable(v1Table.v1Table)) { - a.dataType.foreach(failCharType) - } - if (a.column.length > 1) { throw new AnalysisException( "ALTER COLUMN with qualified column is only supported with v2 tables.") @@ -134,19 +125,13 @@ class ResolveSessionCatalog( s"Available: ${v1Table.schema.fieldNames.mkString(", ")}") } } - // Add Hive type string to metadata. 
- val cleanedDataType = HiveStringType.replaceCharType(dataType) - if (dataType != cleanedDataType) { - builder.putString(HIVE_TYPE_STRING, dataType.catalogString) - } val newColumn = StructField( colName, - cleanedDataType, + dataType, nullable = true, builder.build()) AlterTableChangeColumnCommand(tbl.asTableIdentifier, colName, newColumn) }.getOrElse { - a.dataType.foreach(failCharType) val colName = a.column.toArray val typeChange = a.dataType.map { newDataType => TableChange.updateColumnType(colName, newDataType) @@ -271,16 +256,12 @@ class ResolveSessionCatalog( val (storageFormat, provider) = getStorageFormatAndProvider( c.provider, c.options, c.location, c.serde, ctas = false) if (!isV2Provider(provider)) { - if (!DDLUtils.isHiveTable(Some(provider))) { - assertNoCharTypeInSchema(c.tableSchema) - } val tableDesc = buildCatalogTable(tbl.asTableIdentifier, c.tableSchema, c.partitioning, c.bucketSpec, c.properties, provider, c.location, c.comment, storageFormat, c.external) val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTable(tableDesc, mode, None) } else { - assertNoCharTypeInSchema(c.tableSchema) CreateV2Table( catalog.asTableCatalog, tbl.asIdentifier, @@ -305,7 +286,6 @@ class ResolveSessionCatalog( val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTable(tableDesc, mode, Some(c.asSelect)) } else { - assertNoCharTypeInSchema(c.schema) CreateTableAsSelect( catalog.asTableCatalog, tbl.asIdentifier, @@ -332,7 +312,6 @@ class ResolveSessionCatalog( if (!isV2Provider(provider)) { throw new AnalysisException("REPLACE TABLE is only supported with v2 tables.") } else { - assertNoCharTypeInSchema(c.tableSchema) ReplaceTable( catalog.asTableCatalog, tbl.asIdentifier, @@ -754,17 +733,7 @@ class ResolveSessionCatalog( private def convertToStructField(col: QualifiedColType): StructField = { val builder = new MetadataBuilder col.comment.foreach(builder.putString("comment", _)) - - val cleanedDataType = HiveStringType.replaceCharType(col.dataType) - if (col.dataType != cleanedDataType) { - builder.putString(HIVE_TYPE_STRING, col.dataType.catalogString) - } - - StructField( - col.name.head, - cleanedDataType, - nullable = true, - builder.build()) + StructField(col.name.head, col.dataType, nullable = true, builder.build()) } private def isV2Provider(provider: String): Boolean = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala new file mode 100644 index 0000000000000..35bb86f178eb1 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.sql.catalyst.catalog.HiveTableRelation +import org.apache.spark.sql.catalyst.expressions.{Attribute, BinaryComparison, Expression, In, Literal, StringRPad} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.util.CharVarcharUtils +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.types.{CharType, StringType} +import org.apache.spark.unsafe.types.UTF8String + +/** + * This rule applies char type padding in two places: + * 1. When reading values from column/field of type CHAR(N), right-pad the values to length N. + * 2. When comparing char type column/field with string literal or char type column/field, + * right-pad the shorter one to the longer length. + */ +object ApplyCharTypePadding extends Rule[LogicalPlan] { + + override def apply(plan: LogicalPlan): LogicalPlan = { + val padded = plan.resolveOperatorsUpWithNewOutput { + case r: LogicalRelation => + val projectList = CharVarcharUtils.charTypePadding(r.output) + if (projectList == r.output) { + r -> Nil + } else { + val cleanedOutput = r.output.map(CharVarcharUtils.cleanAttrMetadata) + val padded = Project(projectList, r.copy(output = cleanedOutput)) + padded -> r.output.zip(padded.output) + } + + case r: DataSourceV2Relation => + val projectList = CharVarcharUtils.charTypePadding(r.output) + if (projectList == r.output) { + r -> Nil + } else { + val cleanedOutput = r.output.map(CharVarcharUtils.cleanAttrMetadata) + val padded = Project(projectList, r.copy(output = cleanedOutput)) + padded -> r.output.zip(padded.output) + } + + case r: HiveTableRelation => + val projectList = CharVarcharUtils.charTypePadding(r.output) + if (projectList == r.output) { + r -> Nil + } else { + val cleanedDataCols = r.dataCols.map(CharVarcharUtils.cleanAttrMetadata) + val cleanedPartCols = r.partitionCols.map(CharVarcharUtils.cleanAttrMetadata) + val padded = Project(projectList, + r.copy(dataCols = cleanedDataCols, partitionCols = cleanedPartCols)) + padded -> r.output.zip(padded.output) + } + } + + padded.resolveOperatorsUp { + case operator if operator.resolved => operator.transformExpressionsUp { + // String literal is treated as char type when it's compared to a char type column. + // We should pad the shorter one to the longer length. + case b @ BinaryComparison(attr: Attribute, lit) if lit.foldable => + padAttrLitCmp(attr, lit).map { newChildren => + b.withNewChildren(newChildren) + }.getOrElse(b) + + case b @ BinaryComparison(lit, attr: Attribute) if lit.foldable => + padAttrLitCmp(attr, lit).map { newChildren => + b.withNewChildren(newChildren.reverse) + }.getOrElse(b) + + case i @ In(attr: Attribute, list) + if attr.dataType == StringType && list.forall(_.foldable) => + CharVarcharUtils.getRawType(attr.metadata).flatMap { + case CharType(length) => + val literalCharLengths = list.map(_.eval().asInstanceOf[UTF8String].numChars()) + val targetLen = (length +: literalCharLengths).max + Some(i.copy( + value = addPadding(attr, length, targetLen), + list = list.zip(literalCharLengths).map { + case (lit, charLength) => addPadding(lit, charLength, targetLen) + })) + case _ => None + }.getOrElse(i) + + // For char type column or inner field comparison, pad the shorter one to the longer length. 
+ case b @ BinaryComparison(left: Attribute, right: Attribute) => + b.withNewChildren(CharVarcharUtils.addPaddingInStringComparison(Seq(left, right))) + + case i @ In(attr: Attribute, list) if list.forall(_.isInstanceOf[Attribute]) => + val newChildren = CharVarcharUtils.addPaddingInStringComparison( + attr +: list.map(_.asInstanceOf[Attribute])) + i.copy(value = newChildren.head, list = newChildren.tail) + } + } + } + + private def padAttrLitCmp(attr: Attribute, lit: Expression): Option[Seq[Expression]] = { + if (attr.dataType == StringType) { + CharVarcharUtils.getRawType(attr.metadata).flatMap { + case CharType(length) => + val str = lit.eval().asInstanceOf[UTF8String] + val stringLitLen = str.numChars() + if (length < stringLitLen) { + Some(Seq(StringRPad(attr, Literal(stringLitLen)), lit)) + } else if (length > stringLitLen) { + Some(Seq(attr, StringRPad(lit, Literal(length)))) + } else { + None + } + case _ => None + } + } else { + None + } + } + + private def addPadding(expr: Expression, charLength: Int, targetLength: Int): Expression = { + if (targetLength > charLength) StringRPad(expr, Literal(targetLength)) else expr + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 33a3486bf6f67..8c61c8cd4f52e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} -import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.catalyst.util.{truncatedString, CharVarcharUtils} import org.apache.spark.sql.sources.BaseRelation /** @@ -69,9 +69,17 @@ case class LogicalRelation( } object LogicalRelation { - def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation = - LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming) + def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation = { + // The v1 source may return schema containing char/varchar type. We replace char/varchar + // with "annotated" string type here as the query engine doesn't support char/varchar yet. + val schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(relation.schema) + LogicalRelation(relation, schema.toAttributes, None, isStreaming) + } - def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation = - LogicalRelation(relation, relation.schema.toAttributes, Some(table), false) + def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation = { + // The v1 source may return schema containing char/varchar type. We replace char/varchar + // with "annotated" string type here as the query engine doesn't support char/varchar yet. 
+ val schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema(relation.schema) + LogicalRelation(relation, schema.toAttributes, Some(table), false) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 78f31fb80ecf6..5dd0d2bd74838 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.connector.catalog.TableChange import org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProvider import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects, JdbcType} @@ -761,17 +761,10 @@ object JdbcUtils extends Logging { schema: StructType, caseSensitive: Boolean, createTableColumnTypes: String): Map[String, String] = { - def typeName(f: StructField): String = { - // char/varchar gets translated to string type. Real data type specified by the user - // is available in the field metadata as HIVE_TYPE_STRING - if (f.metadata.contains(HIVE_TYPE_STRING)) { - f.metadata.getString(HIVE_TYPE_STRING) - } else { - f.dataType.catalogString - } - } - - val userSchema = CatalystSqlParser.parseTableSchema(createTableColumnTypes) + val parsedSchema = CatalystSqlParser.parseTableSchema(createTableColumnTypes) + val userSchema = StructType(parsedSchema.map { field => + field.copy(dataType = CharVarcharUtils.getRawType(field.metadata).getOrElse(field.dataType)) + }) val nameEquality = if (caseSensitive) { org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution } else { @@ -791,7 +784,7 @@ object JdbcUtils extends Logging { } } - val userSchemaMap = userSchema.fields.map(f => f.name -> typeName(f)).toMap + val userSchemaMap = userSchema.fields.map(f => f.name -> f.dataType.catalogString).toMap if (caseSensitive) userSchemaMap else CaseInsensitiveMap(userSchemaMap) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala index ce8edce6f08d6..2208e930f6b08 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, NamedExpression, PredicateHelper, SchemaPruning} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.internal.SQLConf @@ -110,7 +111,8 @@ object PushDownUtils extends PredicateHelper { schema: StructType, relation: 
DataSourceV2Relation): Seq[AttributeReference] = { val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap - schema.toAttributes.map { + val cleaned = CharVarcharUtils.replaceCharVarcharWithString(schema).asInstanceOf[StructType] + cleaned.toAttributes.map { // we have to keep the attribute id during transformation a => a.withExprId(nameToAttr(a.name).exprId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 538a5408723bb..a89a5de3b7e72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -189,6 +189,7 @@ abstract class BaseSessionStateBuilder( PreprocessTableCreation(session) +: PreprocessTableInsertion +: DataSourceAnalysis +: + ApplyCharTypePadding +: customPostHocResolutionRules override val extendedCheckRules: Seq[LogicalPlan => Unit] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 7f4ef8be562fb..eb7bb5c87a990 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -26,7 +26,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 -import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} import org.apache.spark.sql.connector.catalog.{SupportsRead, TableProvider} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils @@ -64,7 +64,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * @since 2.0.0 */ def schema(schema: StructType): DataStreamReader = { - this.userSpecifiedSchema = Option(schema) + this.userSpecifiedSchema = Option(CharVarcharUtils.replaceCharVarcharWithStringInSchema(schema)) this } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala new file mode 100644 index 0000000000000..abb13270d20e7 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala @@ -0,0 +1,505 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils +import org.apache.spark.sql.connector.{InMemoryPartitionTableCatalog, SchemaRequiredDataSource} +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.SimpleInsertSource +import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} +import org.apache.spark.sql.types.{ArrayType, CharType, DataType, MapType, StringType, StructField, StructType} + +// The base trait for char/varchar tests that need to be run with different table implementations. +trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { + + def format: String + + def checkColType(f: StructField, dt: DataType): Unit = { + assert(f.dataType == CharVarcharUtils.replaceCharVarcharWithString(dt)) + assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt)) + } + + test("char type values should be padded: top-level columns") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format") + sql("INSERT INTO t VALUES ('1', 'a')") + checkAnswer(spark.table("t"), Row("1", "a" + " " * 4)) + checkColType(spark.table("t").schema(1), CharType(5)) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + } + } + + test("char type values should be padded: partitioned columns") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)") + sql("INSERT INTO t VALUES ('1', 'a')") + checkAnswer(spark.table("t"), Row("1", "a" + " " * 4)) + checkColType(spark.table("t").schema(1), CharType(5)) + + sql("ALTER TABLE t DROP PARTITION(c='a')") + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + } + } + + test("char type values should be padded: nested in struct") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c STRUCT) USING $format") + sql("INSERT INTO t VALUES ('1', struct('a'))") + checkAnswer(spark.table("t"), Row("1", Row("a" + " " * 4))) + checkColType(spark.table("t").schema(1), new StructType().add("c", CharType(5))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + sql("INSERT OVERWRITE t VALUES ('1', struct(null))") + checkAnswer(spark.table("t"), Row("1", Row(null))) + } + } + + test("char type values should be padded: nested in array") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c ARRAY) USING $format") + sql("INSERT INTO t VALUES ('1', array('a', 'ab'))") + checkAnswer(spark.table("t"), Row("1", Seq("a" + " " * 4, "ab" + " " * 3))) + checkColType(spark.table("t").schema(1), ArrayType(CharType(5))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + sql("INSERT OVERWRITE t VALUES ('1', array(null))") + checkAnswer(spark.table("t"), Row("1", Seq(null))) + } + } + + test("char type values should be padded: nested in map key") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c MAP) USING $format") + sql("INSERT INTO t VALUES ('1', map('a', 'ab'))") + checkAnswer(spark.table("t"), Row("1", Map(("a" + " " * 4, "ab")))) + checkColType(spark.table("t").schema(1), MapType(CharType(5), StringType)) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + } + } + + test("char type values should be padded: nested in map 
value") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c MAP) USING $format") + sql("INSERT INTO t VALUES ('1', map('a', 'ab'))") + checkAnswer(spark.table("t"), Row("1", Map(("a", "ab" + " " * 3)))) + checkColType(spark.table("t").schema(1), MapType(StringType, CharType(5))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + sql("INSERT OVERWRITE t VALUES ('1', map('a', null))") + checkAnswer(spark.table("t"), Row("1", Map("a" -> null))) + } + } + + test("char type values should be padded: nested in both map key and value") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c MAP) USING $format") + sql("INSERT INTO t VALUES ('1', map('a', 'ab'))") + checkAnswer(spark.table("t"), Row("1", Map(("a" + " " * 4, "ab" + " " * 8)))) + checkColType(spark.table("t").schema(1), MapType(CharType(5), CharType(10))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + } + } + + test("char type values should be padded: nested in struct of array") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c STRUCT>) USING $format") + sql("INSERT INTO t VALUES ('1', struct(array('a', 'ab')))") + checkAnswer(spark.table("t"), Row("1", Row(Seq("a" + " " * 4, "ab" + " " * 3)))) + checkColType(spark.table("t").schema(1), + new StructType().add("c", ArrayType(CharType(5)))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + sql("INSERT OVERWRITE t VALUES ('1', struct(null))") + checkAnswer(spark.table("t"), Row("1", Row(null))) + sql("INSERT OVERWRITE t VALUES ('1', struct(array(null)))") + checkAnswer(spark.table("t"), Row("1", Row(Seq(null)))) + } + } + + test("char type values should be padded: nested in array of struct") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c ARRAY>) USING $format") + sql("INSERT INTO t VALUES ('1', array(struct('a'), struct('ab')))") + checkAnswer(spark.table("t"), Row("1", Seq(Row("a" + " " * 4), Row("ab" + " " * 3)))) + checkColType(spark.table("t").schema(1), + ArrayType(new StructType().add("c", CharType(5)))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + sql("INSERT OVERWRITE t VALUES ('1', array(null))") + checkAnswer(spark.table("t"), Row("1", Seq(null))) + sql("INSERT OVERWRITE t VALUES ('1', array(struct(null)))") + checkAnswer(spark.table("t"), Row("1", Seq(Row(null)))) + } + } + + test("char type values should be padded: nested in array of array") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c ARRAY>) USING $format") + sql("INSERT INTO t VALUES ('1', array(array('a', 'ab')))") + checkAnswer(spark.table("t"), Row("1", Seq(Seq("a" + " " * 4, "ab" + " " * 3)))) + checkColType(spark.table("t").schema(1), ArrayType(ArrayType(CharType(5)))) + + sql("INSERT OVERWRITE t VALUES ('1', null)") + checkAnswer(spark.table("t"), Row("1", null)) + sql("INSERT OVERWRITE t VALUES ('1', array(null))") + checkAnswer(spark.table("t"), Row("1", Seq(null))) + sql("INSERT OVERWRITE t VALUES ('1', array(array(null)))") + checkAnswer(spark.table("t"), Row("1", Seq(Seq(null)))) + } + } + + private def testTableWrite(f: String => Unit): Unit = { + withTable("t") { f("char") } + withTable("t") { f("varchar") } + } + + test("length check for input string values: top-level columns") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c $typeName(5)) USING $format") + sql("INSERT INTO t VALUES (null)") + checkAnswer(spark.table("t"), Row(null)) + val e = 
intercept[SparkException](sql("INSERT INTO t VALUES ('123456')")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: partitioned columns") { + // DS V2 doesn't support partitioned table. + if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) { + testTableWrite { typeName => + sql(s"CREATE TABLE t(i INT, c $typeName(5)) USING $format PARTITIONED BY (c)") + sql("INSERT INTO t VALUES (1, null)") + checkAnswer(spark.table("t"), Row(1, null)) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (1, '123456')")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + } + + test("length check for input string values: nested in struct") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c STRUCT) USING $format") + sql("INSERT INTO t SELECT struct(null)") + checkAnswer(spark.table("t"), Row(Row(null))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct('123456')")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: nested in array") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c ARRAY<$typeName(5)>) USING $format") + sql("INSERT INTO t VALUES (array(null))") + checkAnswer(spark.table("t"), Row(Seq(null))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array('a', '123456'))")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: nested in map key") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c MAP<$typeName(5), STRING>) USING $format") + val e = intercept[SparkException](sql("INSERT INTO t VALUES (map('123456', 'a'))")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: nested in map value") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c MAP) USING $format") + sql("INSERT INTO t VALUES (map('a', null))") + checkAnswer(spark.table("t"), Row(Map("a" -> null))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (map('a', '123456'))")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: nested in both map key and value") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c MAP<$typeName(5), $typeName(5)>) USING $format") + val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (map('123456', 'a'))")) + assert(e1.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (map('a', '123456'))")) + assert(e2.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: nested in struct of array") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length 
limitation: 5")) + } + } + + test("length check for input string values: nested in array of struct") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c ARRAY>) USING $format") + sql("INSERT INTO t VALUES (array(struct(null)))") + checkAnswer(spark.table("t"), Row(Seq(Row(null)))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: nested in array of array") { + testTableWrite { typeName => + sql(s"CREATE TABLE t(c ARRAY>) USING $format") + sql("INSERT INTO t VALUES (array(array(null)))") + checkAnswer(spark.table("t"), Row(Seq(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))")) + assert(e.getCause.getMessage.contains( + s"input string '123456' exceeds $typeName type length limitation: 5")) + } + } + + test("length check for input string values: with trailing spaces") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES ('12 ', '12 ')") + sql("INSERT INTO t VALUES ('1234 ', '1234 ')") + checkAnswer(spark.table("t"), Seq( + Row("12" + " " * 3, "12 "), + Row("1234 ", "1234 "))) + } + } + + test("length check for input string values: with implicit cast") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES (1234, 1234)") + checkAnswer(spark.table("t"), Row("1234 ", "1234")) + val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)")) + assert(e1.getCause.getMessage.contains( + "input string '123456' exceeds char type length limitation: 5")) + val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)")) + assert(e2.getCause.getMessage.contains( + "input string '123456' exceeds varchar type length limitation: 5")) + } + } + + private def testConditions(df: DataFrame, conditions: Seq[(String, Boolean)]): Unit = { + checkAnswer(df.selectExpr(conditions.map(_._1): _*), Row.fromSeq(conditions.map(_._2))) + } + + test("char type comparison: top-level columns") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(2), c2 CHAR(5)) USING $format") + sql("INSERT INTO t VALUES ('a', 'a')") + testConditions(spark.table("t"), Seq( + ("c1 = 'a'", true), + ("'a' = c1", true), + ("c1 = 'a '", true), + ("c1 > 'a'", false), + ("c1 IN ('a', 'b')", true), + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } + + test("char type comparison: partitioned columns") { + withTable("t") { + sql(s"CREATE TABLE t(i INT, c1 CHAR(2), c2 CHAR(5)) USING $format PARTITIONED BY (c1, c2)") + sql("INSERT INTO t VALUES (1, 'a', 'a')") + testConditions(spark.table("t"), Seq( + ("c1 = 'a'", true), + ("'a' = c1", true), + ("c1 = 'a '", true), + ("c1 > 'a'", false), + ("c1 IN ('a', 'b')", true), + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } + + test("char type comparison: join") { + withTable("t1", "t2") { + sql(s"CREATE TABLE t1(c CHAR(2)) USING $format") + sql(s"CREATE TABLE t2(c CHAR(5)) USING $format") + sql("INSERT INTO t1 VALUES ('a')") + sql("INSERT INTO t2 VALUES ('a')") + checkAnswer(sql("SELECT t1.c FROM t1 JOIN t2 ON t1.c = t2.c"), Row("a ")) + } + } + + test("char type comparison: nested in struct") { + withTable("t") { + sql(s"CREATE TABLE t(c1 STRUCT, c2 STRUCT) USING $format") + sql("INSERT INTO t VALUES (struct('a'), struct('a'))") + 
testConditions(spark.table("t"), Seq( + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } + + test("char type comparison: nested in array") { + withTable("t") { + sql(s"CREATE TABLE t(c1 ARRAY, c2 ARRAY) USING $format") + sql("INSERT INTO t VALUES (array('a', 'b'), array('a', 'b'))") + testConditions(spark.table("t"), Seq( + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } + + test("char type comparison: nested in struct of array") { + withTable("t") { + sql("CREATE TABLE t(c1 STRUCT>, c2 STRUCT>) " + + s"USING $format") + sql("INSERT INTO t VALUES (struct(array('a', 'b')), struct(array('a', 'b')))") + testConditions(spark.table("t"), Seq( + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } + + test("char type comparison: nested in array of struct") { + withTable("t") { + sql("CREATE TABLE t(c1 ARRAY>, c2 ARRAY>) " + + s"USING $format") + sql("INSERT INTO t VALUES (array(struct('a')), array(struct('a')))") + testConditions(spark.table("t"), Seq( + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } + + test("char type comparison: nested in array of array") { + withTable("t") { + sql("CREATE TABLE t(c1 ARRAY>, c2 ARRAY>) " + + s"USING $format") + sql("INSERT INTO t VALUES (array(array('a')), array(array('a')))") + testConditions(spark.table("t"), Seq( + ("c1 = c2", true), + ("c1 < c2", false), + ("c1 IN (c2)", true))) + } + } +} + +// Some basic char/varchar tests which doesn't rely on table implementation. +class BasicCharVarcharTestSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + test("user-specified schema in cast") { + def assertNoCharType(df: DataFrame): Unit = { + checkAnswer(df, Row("0")) + assert(df.schema.map(_.dataType) == Seq(StringType)) + } + + assertNoCharType(spark.range(1).select($"id".cast("char(5)"))) + assertNoCharType(spark.range(1).select($"id".cast(CharType(5)))) + assertNoCharType(spark.range(1).selectExpr("CAST(id AS CHAR(5))")) + assertNoCharType(sql("SELECT CAST(id AS CHAR(5)) FROM range(1)")) + } + + test("user-specified schema in functions") { + val df = sql("""SELECT from_json('{"a": "str"}', 'a CHAR(5)')""") + checkAnswer(df, Row(Row("str"))) + val schema = df.schema.head.dataType.asInstanceOf[StructType] + assert(schema.map(_.dataType) == Seq(StringType)) + } + + test("user-specified schema in DataFrameReader: file source from Dataset") { + val ds = spark.range(10).map(_.toString) + val df1 = spark.read.schema(new StructType().add("id", CharType(5))).csv(ds) + assert(df1.schema.map(_.dataType) == Seq(StringType)) + val df2 = spark.read.schema("id char(5)").csv(ds) + assert(df2.schema.map(_.dataType) == Seq(StringType)) + } + + test("user-specified schema in DataFrameReader: DSV1") { + def checkSchema(df: DataFrame): Unit = { + val relations = df.queryExecution.analyzed.collect { + case l: LogicalRelation => l.relation + } + assert(relations.length == 1) + assert(relations.head.schema.map(_.dataType) == Seq(StringType)) + } + + checkSchema(spark.read.schema(new StructType().add("id", CharType(5))) + .format(classOf[SimpleInsertSource].getName).load()) + checkSchema(spark.read.schema("id char(5)") + .format(classOf[SimpleInsertSource].getName).load()) + } + + test("user-specified schema in DataFrameReader: DSV2") { + def checkSchema(df: DataFrame): Unit = { + val tables = df.queryExecution.analyzed.collect { + case d: DataSourceV2Relation => d.table + } + assert(tables.length == 1) + assert(tables.head.schema.map(_.dataType) == 
Seq(StringType)) + } + + checkSchema(spark.read.schema(new StructType().add("id", CharType(5))) + .format(classOf[SchemaRequiredDataSource].getName).load()) + checkSchema(spark.read.schema("id char(5)") + .format(classOf[SchemaRequiredDataSource].getName).load()) + } +} + +class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSparkSession { + override def format: String = "parquet" + override protected def sparkConf: SparkConf = { + super.sparkConf.set(SQLConf.USE_V1_SOURCE_LIST, "parquet") + } +} + +class DSV2CharVarcharTestSuite extends CharVarcharTestSuite + with SharedSparkSession { + override def format: String = "foo" + protected override def sparkConf = { + super.sparkConf + .set("spark.sql.catalog.testcat", classOf[InMemoryPartitionTableCatalog].getName) + .set(SQLConf.DEFAULT_CATALOG.key, "testcat") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 9710fca6bc82c..20cad721d3d0e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.sources.SimpleScanSource -import org.apache.spark.sql.types.{CharType, DoubleType, HIVE_TYPE_STRING, IntegerType, LongType, MetadataBuilder, StringType, StructField, StructType} +import org.apache.spark.sql.types.{CharType, DoubleType, IntegerType, LongType, StringType, StructField, StructType} class PlanResolutionSuite extends AnalysisTest { import CatalystSqlParser._ @@ -1090,9 +1090,7 @@ class PlanResolutionSuite extends AnalysisTest { } val sql = s"ALTER TABLE v1HiveTable ALTER COLUMN i TYPE char(1)" - val builder = new MetadataBuilder - builder.putString(HIVE_TYPE_STRING, CharType(1).catalogString) - val newColumnWithCleanedType = StructField("i", StringType, true, builder.build()) + val newColumnWithCleanedType = StructField("i", CharType(1), true) val expected = AlterTableChangeColumnCommand( TableIdentifier("v1HiveTable", Some("default")), "i", newColumnWithCleanedType) val parsed = parseAndResolve(sql) @@ -1533,44 +1531,6 @@ class PlanResolutionSuite extends AnalysisTest { } } - test("SPARK-31147: forbid CHAR type in non-Hive tables") { - def checkFailure(t: String, provider: String): Unit = { - val types = Seq( - "CHAR(2)", - "ARRAY", - "MAP", - "MAP", - "STRUCT") - types.foreach { tpe => - intercept[AnalysisException] { - parseAndResolve(s"CREATE TABLE $t(col $tpe) USING $provider") - } - intercept[AnalysisException] { - parseAndResolve(s"REPLACE TABLE $t(col $tpe) USING $provider") - } - intercept[AnalysisException] { - parseAndResolve(s"CREATE OR REPLACE TABLE $t(col $tpe) USING $provider") - } - intercept[AnalysisException] { - parseAndResolve(s"ALTER TABLE $t ADD COLUMN col $tpe") - } - intercept[AnalysisException] { - parseAndResolve(s"ALTER TABLE $t ADD COLUMN col $tpe") - } - intercept[AnalysisException] { - parseAndResolve(s"ALTER TABLE $t ALTER COLUMN col TYPE $tpe") - } - intercept[AnalysisException] { - parseAndResolve(s"ALTER TABLE $t REPLACE COLUMNS (col $tpe)") - } - } - } - - checkFailure("v1Table", v1Format) - checkFailure("v2Table", v2Format) - checkFailure("testcat.tab", 
"foo") - } - private def compareNormalized(plan1: LogicalPlan, plan2: LogicalPlan): Unit = { /** * Normalizes plans: diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index 9a95bf770772e..ca3e714665818 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -22,6 +22,7 @@ import java.sql.{Date, Timestamp} import org.apache.spark.rdd.RDD import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -127,7 +128,7 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { Date.valueOf("1970-01-01"), new Timestamp(20000 + i), s"varchar_$i", - s"char_$i", + s"char_$i".padTo(18, ' '), Seq(i, i + 1), Seq(Map(s"str_$i" -> Row(i.toLong))), Map(i -> i.toString), @@ -206,10 +207,6 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { (2 to 10).map(i => Row(i, i - 1)).toSeq) test("Schema and all fields") { - def hiveMetadata(dt: String): Metadata = { - new MetadataBuilder().putString(HIVE_TYPE_STRING, dt).build() - } - val expectedSchema = StructType( StructField("string$%Field", StringType, true) :: StructField("binaryField", BinaryType, true) :: @@ -224,8 +221,8 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { StructField("decimalField2", DecimalType(9, 2), true) :: StructField("dateField", DateType, true) :: StructField("timestampField", TimestampType, true) :: - StructField("varcharField", StringType, true, hiveMetadata("varchar(12)")) :: - StructField("charField", StringType, true, hiveMetadata("char(18)")) :: + StructField("varcharField", VarcharType(12), true) :: + StructField("charField", CharType(18), true) :: StructField("arrayFieldSimple", ArrayType(IntegerType), true) :: StructField("arrayFieldComplex", ArrayType( @@ -248,7 +245,8 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { Nil ) - assert(expectedSchema == spark.table("tableWithSchema").schema) + assert(CharVarcharUtils.replaceCharVarcharWithStringInSchema(expectedSchema) == + spark.table("tableWithSchema").schema) withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { checkAnswer( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index b30492802495f..da37b61688951 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -90,6 +90,7 @@ class HiveSessionStateBuilder( PreprocessTableCreation(session) +: PreprocessTableInsertion +: DataSourceAnalysis +: + ApplyCharTypePadding +: HiveAnalysis +: customPostHocResolutionRules diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index b2f0867114bae..bada131c8ba6d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -978,19 +978,14 @@ private[hive] class HiveClientImpl( private[hive] object HiveClientImpl extends Logging { /** 
Converts the native StructField to Hive's FieldSchema. */ def toHiveColumn(c: StructField): FieldSchema = { - val typeString = if (c.metadata.contains(HIVE_TYPE_STRING)) { - c.metadata.getString(HIVE_TYPE_STRING) - } else { - // replace NullType to HiveVoidType since Hive parse void not null. - HiveVoidType.replaceVoidType(c.dataType).catalogString - } + val typeString = HiveVoidType.replaceVoidType(c.dataType).catalogString new FieldSchema(c.name, typeString, c.getComment().orNull) } /** Get the Spark SQL native DataType from Hive's FieldSchema. */ private def getSparkSQLDataType(hc: FieldSchema): DataType = { try { - CatalystSqlParser.parseDataType(hc.getType) + CatalystSqlParser.parseRawDataType(hc.getType) } catch { case e: ParseException => throw new SparkException( @@ -1001,18 +996,10 @@ private[hive] object HiveClientImpl extends Logging { /** Builds the native StructField from Hive's FieldSchema. */ def fromHiveColumn(hc: FieldSchema): StructField = { val columnType = getSparkSQLDataType(hc) - val replacedVoidType = HiveVoidType.replaceVoidType(columnType) - val metadata = if (hc.getType != replacedVoidType.catalogString) { - new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build() - } else { - Metadata.empty - } - val field = StructField( name = hc.getName, dataType = columnType, - nullable = true, - metadata = metadata) + nullable = true) Option(hc.getComment).map(field.withComment).getOrElse(field) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala new file mode 100644 index 0000000000000..55d305fda4f96 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.hive.test.TestHiveSingleton + +class HiveCharVarcharTestSuite extends CharVarcharTestSuite with TestHiveSingleton { + + // The default Hive serde doesn't support nested null values. 
+  override def format: String = "hive OPTIONS(fileFormat='parquet')"
+
+  private var originalPartitionMode = ""
+
+  override protected def beforeAll(): Unit = {
+    super.beforeAll()
+    originalPartitionMode = spark.conf.get("hive.exec.dynamic.partition.mode", "")
+    spark.conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
+  }
+
+  override protected def afterAll(): Unit = {
+    if (originalPartitionMode == "") {
+      spark.conf.unset("hive.exec.dynamic.partition.mode")
+    } else {
+      spark.conf.set("hive.exec.dynamic.partition.mode", originalPartitionMode)
+    }
+    super.afterAll()
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index 8f71ba3337aa2..1a6f6843d3911 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -113,24 +113,19 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils {
         .add("c9", "date")
         .add("c10", "timestamp")
         .add("c11", "string")
-        .add("c12", "string", true,
-          new MetadataBuilder().putString(HIVE_TYPE_STRING, "char(10)").build())
-        .add("c13", "string", true,
-          new MetadataBuilder().putString(HIVE_TYPE_STRING, "varchar(10)").build())
+        .add("c12", CharType(10), true)
+        .add("c13", VarcharType(10), true)
         .add("c14", "binary")
         .add("c15", "decimal")
         .add("c16", "decimal(10)")
         .add("c17", "decimal(10,2)")
         .add("c18", "array")
         .add("c19", "array")
-        .add("c20", "array", true,
-          new MetadataBuilder().putString(HIVE_TYPE_STRING, "array").build())
+        .add("c20", ArrayType(CharType(10)), true)
         .add("c21", "map")
-        .add("c22", "map", true,
-          new MetadataBuilder().putString(HIVE_TYPE_STRING, "map").build())
+        .add("c22", MapType(IntegerType, CharType(10)), true)
         .add("c23", "struct")
-        .add("c24", "struct", true,
-          new MetadataBuilder().putString(HIVE_TYPE_STRING, "struct").build())
+        .add("c24", new StructType().add("c", VarcharType(10)).add("d", "int"), true)
       assert(schema == expectedSchema)
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index b8b1da4cb9db7..2dfb8bb552594 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -2251,8 +2251,8 @@ class HiveDDLSuite
         )

         sql("ALTER TABLE tab ADD COLUMNS (c5 char(10))")
-        assert(spark.table("tab").schema.find(_.name == "c5")
-          .get.metadata.getString("HIVE_TYPE_STRING") == "char(10)")
+        assert(spark.sharedState.externalCatalog.getTable("default", "tab")
+          .schema.find(_.name == "c5").get.dataType == CharType(10))
       }
     }
   }

From 6e5446e61f278e9afac342e8f33905f5630aa7d5 Mon Sep 17 00:00:00 2001
From: Pascal Gillet
Date: Mon, 30 Nov 2020 19:31:42 +0900
Subject: [PATCH 048/150] [SPARK-33579][UI] Fix executor blank page behind proxy

### What changes were proposed in this pull request?
Fix some "hardcoded" API URLs in the Web UI. More specifically, we avoid the use of `location.origin` when constructing URLs for internal API calls within the JavaScript. Instead, we use the `uiRoot` global variable.

### Why are the changes needed?
On one hand, it allows us to build relative URLs. On the other hand, `uiRoot` reflects the Spark property `spark.ui.proxyBase`, which can be set to change the root path of the Web UI.

If `spark.ui.proxyBase` is actually set, the original URLs become incorrect, and we end up with a blank Executors page. I encountered this bug when accessing the Web UI behind a proxy (in my case a Kubernetes Ingress).

See the following link for more context:
https://github.com/jupyterhub/jupyter-server-proxy/issues/57#issuecomment-699163115

### Does this PR introduce _any_ user-facing change?
Yes, as all the changes introduced are in the JavaScript for the Web UI.

### How was this patch tested?
I modified and debugged the JavaScript as in the commit, with the help of the developer tools in Google Chrome, while accessing the Web UI of my Spark app behind my k8s ingress.

Closes #30523 from pgillet/fix-executors-blank-page-behind-proxy.

Authored-by: Pascal Gillet
Signed-off-by: Kousuke Saruta
---
 .../main/resources/org/apache/spark/ui/static/stagepage.js | 2 +-
 core/src/main/resources/org/apache/spark/ui/static/utils.js | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js
index ee1115868f69b..2877aa819ab9e 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js
@@ -70,7 +70,7 @@ function stageEndPoint(appId) {
       return newBaseURI + "/api/v1/applications/" + appId + "/" + appAttemptId + "/stages/" + stageId;
     }
   }
-  return location.origin + "/api/v1/applications/" + appId + "/stages/" + stageId;
+  return uiRoot + "/api/v1/applications/" + appId + "/stages/" + stageId;
 }
 
 function getColumnNameForTaskMetricSummary(columnKey) {
diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js
index 7e6dd678e2641..f4914f000e705 100644
--- a/core/src/main/resources/org/apache/spark/ui/static/utils.js
+++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js
@@ -105,7 +105,7 @@ function getStandAloneAppId(cb) {
   }
   // Looks like Web UI is running in standalone mode
   // Let's get application-id using REST End Point
-  $.getJSON(location.origin + "/api/v1/applications", function(response, status, jqXHR) {
+  $.getJSON(uiRoot + "/api/v1/applications", function(response, status, jqXHR) {
     if (response && response.length > 0) {
       var appId = response[0].id;
       cb(appId);
@@ -152,7 +152,7 @@ function createTemplateURI(appId, templateName) {
     var baseURI = words.slice(0, ind).join('/') + '/static/' + templateName + '-template.html';
     return baseURI;
   }
-  return location.origin + "/static/" + templateName + "-template.html";
+  return uiRoot + "/static/" + templateName + "-template.html";
 }
 
 function setDataTableDefaults() {
@@ -193,5 +193,5 @@ function createRESTEndPointForExecutorsPage(appId) {
       return newBaseURI + "/api/v1/applications/" + appId + "/" + attemptId + "/allexecutors";
     }
   }
-  return location.origin + "/api/v1/applications/" + appId + "/allexecutors";
+  return uiRoot + "/api/v1/applications/" + appId + "/allexecutors";
 }

From 0a612b6a40696ed8ce00997ebb4e76d05adbbd82 Mon Sep 17 00:00:00 2001
From: Max Gekk
Date: Mon, 30 Nov 2020 13:45:53 +0000
Subject: [PATCH 049/150] [SPARK-33452][SQL] Support v2 SHOW PARTITIONS

### What changes were proposed in this pull request?
1. Remove the logical node `ShowPartitionsStatement` and replace it with the new V2 node `ShowPartitions`.
2. Implement the V2 execution node `ShowPartitionsExec`, similar to the V1 `ShowPartitionsCommand`.

### Why are the changes needed?
To have feature parity with Datasource V1. ### Does this PR introduce _any_ user-facing change? Yes. Before the change, `SHOW PARTITIONS` fails in V2 table catalogs with the exception: ``` org.apache.spark.sql.AnalysisException: SHOW PARTITIONS is only supported with v1 tables. at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog.org$apache$spark$sql$catalyst$analysis$ResolveSessionCatalog$$parseV1Table(ResolveSessionCatalog.scala:628) at org.apache.spark.sql.catalyst.analysis.ResolveSessionCatalog$$anonfun$apply$1.applyOrElse(ResolveSessionCatalog.scala:466) ``` ### How was this patch tested? By running the following test suites: 1. Modified `ShowPartitionsParserSuite` where `ShowPartitionsStatement` is replaced by V2 `ShowPartitions`. 2. `v2.ShowPartitionsSuite` Closes #30398 from MaxGekk/show-partitions-exec-v2. Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 2 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 14 ++ .../analysis/ResolvePartitionSpec.scala | 31 +++-- .../catalyst/analysis/v2ResolutionPlans.scala | 3 +- .../sql/catalyst/parser/AstBuilder.scala | 9 +- .../catalyst/plans/logical/statements.scala | 7 - .../catalyst/plans/logical/v2Commands.scala | 15 +++ .../analysis/ResolveSessionCatalog.scala | 9 +- .../v2/AlterTableAddPartitionExec.scala | 8 +- .../v2/AlterTableDropPartitionExec.scala | 2 +- .../datasources/v2/DataSourceV2Strategy.scala | 11 +- .../datasources/v2/ShowPartitionsExec.scala | 65 ++++++++++ .../sql/connector/DataSourceV2SQLSuite.scala | 1 - .../command/ShowPartitionsParserSuite.scala | 23 ++-- .../command/ShowPartitionsSuiteBase.scala | 120 +++++++++++++++++- .../command/v1/ShowPartitionsSuite.scala | 110 +++------------- .../command/v2/ShowPartitionsSuite.scala | 38 +++--- .../hive/PartitionedTablePerfStatsSuite.scala | 4 +- 18 files changed, 309 insertions(+), 163 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 23a1b7bdde93c..abd38f2f9d940 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1085,7 +1085,7 @@ class Analyzer(override val catalogManager: CatalogManager) lookupTableOrView(identifier).map { case v: ResolvedView => val viewStr = if (v.isTemp) "temp view" else "view" - u.failAnalysis(s"${v.identifier.quoted} is a $viewStr. '$cmd' expects a table.'") + u.failAnalysis(s"${v.identifier.quoted} is a $viewStr. 
'$cmd' expects a table.") case table => table }.getOrElse(u) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 2818ba58075cd..61ac6346ff944 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -577,6 +577,8 @@ trait CheckAnalysis extends PredicateHelper { case AlterTableDropPartition(ResolvedTable(_, _, table), parts, _, _, _) => checkAlterTablePartition(table, parts) + case showPartitions: ShowPartitions => checkShowPartitions(showPartitions) + case _ => // Fallbacks to the following checks } @@ -1009,4 +1011,16 @@ trait CheckAnalysis extends PredicateHelper { case _ => } } + + // Make sure that the `SHOW PARTITIONS` command is allowed for the table + private def checkShowPartitions(showPartitions: ShowPartitions): Unit = showPartitions match { + case ShowPartitions(rt: ResolvedTable, _) + if !rt.table.isInstanceOf[SupportsPartitionManagement] => + failAnalysis(s"SHOW PARTITIONS cannot run for a table which does not support partitioning") + case ShowPartitions(ResolvedTable(_, _, partTable: SupportsPartitionManagement), _) + if partTable.partitionSchema().isEmpty => + failAnalysis( + s"SHOW PARTITIONS is not allowed on a table that is not partitioned: ${partTable.name()}") + case _ => + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala index 98c6872a47cc6..38991a9e24fa8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{AlterTableAddPartition, AlterTableDropPartition, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{AlterTableAddPartition, AlterTableDropPartition, LogicalPlan, ShowPartitions} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.catalog.SupportsPartitionManagement @@ -40,6 +40,12 @@ object ResolvePartitionSpec extends Rule[LogicalPlan] { case r @ AlterTableDropPartition( ResolvedTable(_, _, table: SupportsPartitionManagement), partSpecs, _, _, _) => r.copy(parts = resolvePartitionSpecs(table.name, partSpecs, table.partitionSchema())) + + case r @ ShowPartitions(ResolvedTable(_, _, table: SupportsPartitionManagement), partSpecs) => + r.copy(pattern = resolvePartitionSpecs( + table.name, + partSpecs.toSeq, + table.partitionSchema()).headOption) } private def resolvePartitionSpecs( @@ -48,25 +54,26 @@ object ResolvePartitionSpec extends Rule[LogicalPlan] { partSchema: StructType): Seq[ResolvedPartitionSpec] = partSpecs.map { case unresolvedPartSpec: UnresolvedPartitionSpec => + val normalizedSpec = normalizePartitionSpec( + unresolvedPartSpec.spec, + partSchema.map(_.name), + tableName, + conf.resolver) + val partitionNames = normalizedSpec.keySet + val requestedFields = 
partSchema.filter(field => partitionNames.contains(field.name)) ResolvedPartitionSpec( - convertToPartIdent(tableName, unresolvedPartSpec.spec, partSchema), + requestedFields.map(_.name), + convertToPartIdent(normalizedSpec, requestedFields), unresolvedPartSpec.location) case resolvedPartitionSpec: ResolvedPartitionSpec => resolvedPartitionSpec } private def convertToPartIdent( - tableName: String, partitionSpec: TablePartitionSpec, - partSchema: StructType): InternalRow = { - val normalizedSpec = normalizePartitionSpec( - partitionSpec, - partSchema.map(_.name), - tableName, - conf.resolver) - - val partValues = partSchema.map { part => - val raw = normalizedSpec.get(part.name).orNull + schema: Seq[StructField]): InternalRow = { + val partValues = schema.map { part => + val raw = partitionSpec.get(part.name).orNull val dt = CharVarcharUtils.replaceCharVarcharWithString(part.dataType) Cast(Literal.create(raw, StringType), dt, Some(conf.sessionLocalTimeZone)).eval() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala index 95fc4f47dec7f..1518f064d78db 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala @@ -89,7 +89,8 @@ case class ResolvedTable(catalog: TableCatalog, identifier: Identifier, table: T } case class ResolvedPartitionSpec( - spec: InternalRow, + names: Seq[String], + ident: InternalRow, location: Option[String] = None) extends PartitionSpec /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index ce95ea4b41def..ff8b56f0b724b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3611,9 +3611,12 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * }}} */ override def visitShowPartitions(ctx: ShowPartitionsContext): LogicalPlan = withOrigin(ctx) { - val table = visitMultipartIdentifier(ctx.multipartIdentifier) - val partitionKeys = Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec) - ShowPartitionsStatement(table, partitionKeys) + val partitionKeys = Option(ctx.partitionSpec).map { specCtx => + UnresolvedPartitionSpec(visitNonOptionalPartitionSpec(specCtx), None) + } + ShowPartitions( + UnresolvedTable(visitMultipartIdentifier(ctx.multipartIdentifier()), "SHOW PARTITIONS"), + partitionKeys) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index effb4cff75930..1763547792e35 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -419,13 +419,6 @@ case class TruncateTableStatement( tableName: Seq[String], partitionSpec: Option[TablePartitionSpec]) extends ParsedStatement -/** - * A SHOW PARTITIONS statement, as parsed from SQL - */ -case class ShowPartitionsStatement( - tableName: Seq[String], - partitionSpec: Option[TablePartitionSpec]) extends ParsedStatement - /** * A SHOW CURRENT NAMESPACE 
statement, as parsed from SQL */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 4931f0eb2c007..67056470418fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -691,3 +691,18 @@ case class TruncateTable( override def children: Seq[LogicalPlan] = child :: Nil } + +/** + * The logical plan of the SHOW PARTITIONS command. + */ +case class ShowPartitions( + child: LogicalPlan, + pattern: Option[PartitionSpec]) extends Command { + override def children: Seq[LogicalPlan] = child :: Nil + + override lazy val resolved: Boolean = + childrenResolved && pattern.forall(_.isInstanceOf[ResolvedPartitionSpec]) + + override val output: Seq[Attribute] = Seq( + AttributeReference("partition", StringType, nullable = false)()) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 53edd4fca7794..f6005f4b413a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -430,11 +430,12 @@ class ResolveSessionCatalog( ident.asTableIdentifier, partitionSpec) - case ShowPartitionsStatement(tbl, partitionSpec) => - val v1TableName = parseV1Table(tbl, "SHOW PARTITIONS") + case ShowPartitions( + ResolvedV1TableOrViewIdentifier(ident), + pattern @ (None | Some(UnresolvedPartitionSpec(_, _)))) => ShowPartitionsCommand( - v1TableName.asTableIdentifier, - partitionSpec) + ident.asTableIdentifier, + pattern.map(_.asInstanceOf[UnresolvedPartitionSpec].spec)) case ShowColumns(ResolvedV1TableOrViewIdentifier(ident), ns) => val v1TableName = ident.asTableIdentifier diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableAddPartitionExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableAddPartitionExec.scala index 0171cdd9ca41a..d7fe25cff2064 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableAddPartitionExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableAddPartitionExec.scala @@ -37,20 +37,20 @@ case class AlterTableAddPartitionExec( override protected def run(): Seq[InternalRow] = { val (existsParts, notExistsParts) = - partSpecs.partition(p => table.partitionExists(p.spec)) + partSpecs.partition(p => table.partitionExists(p.ident)) if (existsParts.nonEmpty && !ignoreIfExists) { throw new PartitionsAlreadyExistException( - table.name(), existsParts.map(_.spec), table.partitionSchema()) + table.name(), existsParts.map(_.ident), table.partitionSchema()) } notExistsParts match { case Seq() => // Nothing will be done case Seq(partitionSpec) => val partProp = partitionSpec.location.map(loc => "location" -> loc).toMap - table.createPartition(partitionSpec.spec, partProp.asJava) + table.createPartition(partitionSpec.ident, partProp.asJava) case _ if table.isInstanceOf[SupportsAtomicPartitionManagement] => - val partIdents = notExistsParts.map(_.spec) + val partIdents = notExistsParts.map(_.ident) val partProps = notExistsParts.map(_.location.map(loc => "location" -> loc).toMap) 
table.asAtomicPartitionable .createPartitions( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableDropPartitionExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableDropPartitionExec.scala index 09a65804a05eb..c7a68ecb2bbee 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableDropPartitionExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableDropPartitionExec.scala @@ -35,7 +35,7 @@ case class AlterTableDropPartitionExec( override protected def run(): Seq[InternalRow] = { val (existsPartIdents, notExistsPartIdents) = - partSpecs.map(_.spec).partition(table.partitionExists) + partSpecs.map(_.ident).partition(table.partitionExists) if (notExistsPartIdents.nonEmpty && !ignoreIfNotExists) { throw new NoSuchPartitionsException( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 1fae8d937e90c..0c7bc19ad054e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ import org.apache.spark.sql.{AnalysisException, SparkSession, Strategy} -import org.apache.spark.sql.catalyst.analysis.{ResolvedNamespace, ResolvedTable} +import org.apache.spark.sql.catalyst.analysis.{ResolvedNamespace, ResolvedPartitionSpec, ResolvedTable} import org.apache.spark.sql.catalyst.expressions.{And, Expression, NamedExpression, PredicateHelper, SubqueryExpression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical._ @@ -318,6 +318,15 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case ShowColumns(_: ResolvedTable, _) => throw new AnalysisException("SHOW COLUMNS is not supported for v2 tables.") + case r @ ShowPartitions( + ResolvedTable(catalog, _, table: SupportsPartitionManagement), + pattern @ (None | Some(_: ResolvedPartitionSpec))) => + ShowPartitionsExec( + r.output, + catalog, + table, + pattern.map(_.asInstanceOf[ResolvedPartitionSpec])) :: Nil + case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala new file mode 100644 index 0000000000000..44d6f4495f552 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.ResolvedPartitionSpec +import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal} +import org.apache.spark.sql.connector.catalog.{SupportsPartitionManagement, TableCatalog} +import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StringType +import org.apache.spark.unsafe.types.UTF8String + +/** + * Physical plan node for showing partitions. + */ +case class ShowPartitionsExec( + output: Seq[Attribute], + catalog: TableCatalog, + table: SupportsPartitionManagement, + partitionSpec: Option[ResolvedPartitionSpec]) extends V2CommandExec with LeafExecNode { + override protected def run(): Seq[InternalRow] = { + val (names, ident) = partitionSpec + .map(spec => (spec.names, spec.ident)) + // listPartitionByNames() should return all partitions if the partition spec + // does not specify any partition names. + .getOrElse((Seq.empty[String], InternalRow.empty)) + val partitionIdentifiers = table.listPartitionByNames(names.toArray, ident) + // Converting partition identifiers as `InternalRow` of partition values, + // for instance InternalRow(value0, value1, ..., valueN), to `InternalRow`s + // with a string in the format: "col0=value0/col1=value1/.../colN=valueN". 
+ val schema = table.partitionSchema() + val len = schema.length + val partitions = new Array[String](len) + val timeZoneId = SQLConf.get.sessionLocalTimeZone + partitionIdentifiers.map { row => + var i = 0 + while (i < len) { + val dataType = schema(i).dataType + val partValue = row.get(i, dataType) + val partValueStr = Cast(Literal(partValue, dataType), StringType, Some(timeZoneId)) + .eval().toString + partitions(i) = escapePathName(schema(i).name) + "=" + escapePathName(partValueStr) + i += 1 + } + InternalRow(UTF8String.fromString(partitions.mkString("/"))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index ffbc2287d81ad..583bc694dc3be 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2284,7 +2284,6 @@ class DataSourceV2SQLSuite verify(s"CACHE TABLE $t") verify(s"UNCACHE TABLE $t") verify(s"TRUNCATE TABLE $t") - verify(s"SHOW PARTITIONS $t") verify(s"SHOW COLUMNS FROM $t") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala index bc75528b9644c..7b5cf8af4eead 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala @@ -17,25 +17,30 @@ package org.apache.spark.sql.execution.command -import org.apache.spark.sql.catalyst.analysis.AnalysisTest +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedPartitionSpec, UnresolvedTable} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.catalyst.plans.logical.ShowPartitionsStatement +import org.apache.spark.sql.catalyst.plans.logical.ShowPartitions import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.test.SharedSparkSession class ShowPartitionsParserSuite extends AnalysisTest with SharedSparkSession { test("SHOW PARTITIONS") { + val commandName = "SHOW PARTITIONS" Seq( - "SHOW PARTITIONS t1" -> ShowPartitionsStatement(Seq("t1"), None), - "SHOW PARTITIONS db1.t1" -> ShowPartitionsStatement(Seq("db1", "t1"), None), + "SHOW PARTITIONS t1" -> ShowPartitions(UnresolvedTable(Seq("t1"), commandName), None), + "SHOW PARTITIONS db1.t1" -> ShowPartitions( + UnresolvedTable(Seq("db1", "t1"), commandName), None), "SHOW PARTITIONS t1 PARTITION(partcol1='partvalue', partcol2='partvalue')" -> - ShowPartitionsStatement( - Seq("t1"), - Some(Map("partcol1" -> "partvalue", "partcol2" -> "partvalue"))), - "SHOW PARTITIONS a.b.c" -> ShowPartitionsStatement(Seq("a", "b", "c"), None), + ShowPartitions( + UnresolvedTable(Seq("t1"), commandName), + Some(UnresolvedPartitionSpec(Map("partcol1" -> "partvalue", "partcol2" -> "partvalue")))), + "SHOW PARTITIONS a.b.c" -> ShowPartitions( + UnresolvedTable(Seq("a", "b", "c"), commandName), None), "SHOW PARTITIONS a.b.c PARTITION(ds='2017-06-10')" -> - ShowPartitionsStatement(Seq("a", "b", "c"), Some(Map("ds" -> "2017-06-10"))) + ShowPartitions( + UnresolvedTable(Seq("a", "b", "c"), commandName), + Some(UnresolvedPartitionSpec(Map("ds" -> "2017-06-10")))) ).foreach { case (sql, expected) => val 
parsed = parsePlan(sql) comparePlans(parsed, expected) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala index 413e170326eea..82457f96a3003 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala @@ -20,17 +20,133 @@ package org.apache.spark.sql.execution.command import org.scalactic.source.Position import org.scalatest.Tag -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.{StringType, StructType} trait ShowPartitionsSuiteBase extends QueryTest with SQLTestUtils { protected def version: String protected def catalog: String - protected def defaultNamespace: Seq[String] protected def defaultUsing: String + protected def wrongPartitionColumnsError(columns: String*): String + // Gets the schema of `SHOW PARTITIONS` + private val showSchema: StructType = new StructType().add("partition", StringType, false) + protected def runShowPartitionsSql(sqlText: String, expected: Seq[Row]): Unit = { + val df = spark.sql(sqlText) + assert(df.schema === showSchema) + checkAnswer(df, expected) + } override def test(testName: String, testTags: Tag*)(testFun: => Any) (implicit pos: Position): Unit = { super.test(s"SHOW PARTITIONS $version: " + testName, testTags: _*)(testFun) } + + protected def createDateTable(table: String): Unit = { + sql(s""" + |CREATE TABLE $table (price int, qty int, year int, month int) + |$defaultUsing + |partitioned by (year, month)""".stripMargin) + sql(s"INSERT INTO $table PARTITION(year = 2015, month = 1) SELECT 1, 1") + sql(s"INSERT INTO $table PARTITION(year = 2015, month = 2) SELECT 2, 2") + sql(s"ALTER TABLE $table ADD PARTITION(year = 2016, month = 2)") + sql(s"ALTER TABLE $table ADD PARTITION(year = 2016, month = 3)") + } + + protected def createWideTable(table: String): Unit = { + sql(s""" + |CREATE TABLE $table ( + | price int, qty int, + | year int, month int, hour int, minute int, sec int, extra int) + |$defaultUsing + |PARTITIONED BY (year, month, hour, minute, sec, extra) + |""".stripMargin) + sql(s""" + |INSERT INTO $table + |PARTITION(year = 2016, month = 3, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 + |""".stripMargin) + sql(s""" + |ALTER TABLE $table + |ADD PARTITION(year = 2016, month = 4, hour = 10, minute = 10, sec = 10, extra = 1) + |""".stripMargin) + } + + test("show partitions of non-partitioned table") { + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val table = s"$catalog.ns.not_partitioned_table" + withTable(table) { + sql(s"CREATE TABLE $table (col1 int) $defaultUsing") + val errMsg = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $table") + }.getMessage + assert(errMsg.contains("not allowed on a table that is not partitioned")) + } + } + } + + test("non-partitioning columns") { + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val table = s"$catalog.ns.dateTable" + withTable(table) { + createDateTable(table) + val errMsg = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $table PARTITION(abcd=2015, xyz=1)") + }.getMessage + assert(errMsg.contains(wrongPartitionColumnsError("abcd", "xyz"))) + } + } + } + + test("show everything") { + 
withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val table = s"$catalog.ns.dateTable" + withTable(table) { + createDateTable(table) + runShowPartitionsSql( + s"show partitions $table", + Row("year=2015/month=1") :: + Row("year=2015/month=2") :: + Row("year=2016/month=2") :: + Row("year=2016/month=3") :: Nil) + } + } + } + + test("filter by partitions") { + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val table = s"$catalog.ns.dateTable" + withTable(table) { + createDateTable(table) + runShowPartitionsSql( + s"show partitions $table PARTITION(year=2015)", + Row("year=2015/month=1") :: + Row("year=2015/month=2") :: Nil) + runShowPartitionsSql( + s"show partitions $table PARTITION(year=2015, month=1)", + Row("year=2015/month=1") :: Nil) + runShowPartitionsSql( + s"show partitions $table PARTITION(month=2)", + Row("year=2015/month=2") :: + Row("year=2016/month=2") :: Nil) + } + } + } + + test("show everything more than 5 part keys") { + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val table = s"$catalog.ns.wideTable" + withTable(table) { + createWideTable(table) + runShowPartitionsSql( + s"show partitions $table", + Row("year=2016/month=3/hour=10/minute=10/sec=10/extra=1") :: + Row("year=2016/month=4/hour=10/minute=10/sec=10/extra=1") :: Nil) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala index bcc71e9b7241c..2b2bc9e63dc82 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.command.v1 import org.apache.spark.sql.{AnalysisException, Row, SaveMode} -import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution.command import org.apache.spark.sql.test.SharedSparkSession @@ -26,104 +25,27 @@ import org.apache.spark.sql.test.SharedSparkSession trait ShowPartitionsSuiteBase extends command.ShowPartitionsSuiteBase { override def version: String = "V1" override def catalog: String = CatalogManager.SESSION_CATALOG_NAME - override def defaultNamespace: Seq[String] = Seq("default") override def defaultUsing: String = "USING parquet" - private def createDateTable(table: String): Unit = { - sql(s""" - |CREATE TABLE $table (price int, qty int, year int, month int) - |$defaultUsing - |partitioned by (year, month)""".stripMargin) - sql(s"INSERT INTO $table PARTITION(year = 2015, month = 1) SELECT 1, 1") - sql(s"INSERT INTO $table PARTITION(year = 2015, month = 2) SELECT 2, 2") - sql(s"INSERT INTO $table PARTITION(year = 2016, month = 2) SELECT 3, 3") - sql(s"INSERT INTO $table PARTITION(year = 2016, month = 3) SELECT 3, 3") + override protected def wrongPartitionColumnsError(columns: String*): String = { + s"Non-partitioning column(s) ${columns.mkString("[", ", ", "]")} are specified" } - test("show everything") { + test("show everything in the default database") { val table = "dateTable" withTable(table) { createDateTable(table) - checkAnswer( - sql(s"show partitions $table"), + runShowPartitionsSql( + s"show partitions default.$table", Row("year=2015/month=1") :: - Row("year=2015/month=2") :: - Row("year=2016/month=2") :: - Row("year=2016/month=3") :: Nil) - - 
checkAnswer( - sql(s"show partitions default.$table"), - Row("year=2015/month=1") :: - Row("year=2015/month=2") :: - Row("year=2016/month=2") :: - Row("year=2016/month=3") :: Nil) - } - } - - test("filter by partitions") { - val table = "dateTable" - withTable(table) { - createDateTable(table) - checkAnswer( - sql(s"show partitions default.$table PARTITION(year=2015)"), - Row("year=2015/month=1") :: - Row("year=2015/month=2") :: Nil) - checkAnswer( - sql(s"show partitions default.$table PARTITION(year=2015, month=1)"), - Row("year=2015/month=1") :: Nil) - checkAnswer( - sql(s"show partitions default.$table PARTITION(month=2)"), Row("year=2015/month=2") :: - Row("year=2016/month=2") :: Nil) - } - } - - test("show everything more than 5 part keys") { - val table = "wideTable" - withTable(table) { - sql(s""" - |CREATE TABLE $table ( - | price int, qty int, - | year int, month int, hour int, minute int, sec int, extra int) - |$defaultUsing - |PARTITIONED BY (year, month, hour, minute, sec, extra)""".stripMargin) - sql(s""" - |INSERT INTO $table - |PARTITION(year = 2016, month = 3, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 - """.stripMargin) - sql(s""" - |INSERT INTO $table - |PARTITION(year = 2016, month = 4, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 - """.stripMargin) - checkAnswer( - sql(s"show partitions $table"), - Row("year=2016/month=3/hour=10/minute=10/sec=10/extra=1") :: - Row("year=2016/month=4/hour=10/minute=10/sec=10/extra=1") :: Nil) - } - } - - test("non-partitioning columns") { - val table = "dateTable" - withTable(table) { - createDateTable(table) - val errMsg = intercept[AnalysisException] { - sql(s"SHOW PARTITIONS $table PARTITION(abcd=2015, xyz=1)") - }.getMessage - assert(errMsg.contains("Non-partitioning column(s) [abcd, xyz] are specified")) - } - } - - test("show partitions of non-partitioned table") { - val table = "not_partitioned_table" - withTable(table) { - sql(s"CREATE TABLE $table (col1 int) $defaultUsing") - val errMsg = intercept[AnalysisException] { - sql(s"SHOW PARTITIONS $table") - }.getMessage - assert(errMsg.contains("not allowed on a table that is not partitioned")) + Row("year=2016/month=2") :: + Row("year=2016/month=3") :: Nil) } } + // The test fails for V2 Table Catalogs with the exception: + // org.apache.spark.sql.AnalysisException: CREATE VIEW is only supported with v1 tables. 
test("show partitions of a view") { val table = "dateTable" withTable(table) { @@ -134,7 +56,7 @@ trait ShowPartitionsSuiteBase extends command.ShowPartitionsSuiteBase { val errMsg = intercept[AnalysisException] { sql(s"SHOW PARTITIONS $view") }.getMessage - assert(errMsg.contains("is not allowed on a view")) + assert(errMsg.contains("'SHOW PARTITIONS' expects a table")) } } } @@ -143,10 +65,10 @@ trait ShowPartitionsSuiteBase extends command.ShowPartitionsSuiteBase { val viewName = "test_view" withTempView(viewName) { spark.range(10).createTempView(viewName) - val errMsg = intercept[NoSuchTableException] { + val errMsg = intercept[AnalysisException] { sql(s"SHOW PARTITIONS $viewName") }.getMessage - assert(errMsg.contains(s"Table or view '$viewName' not found")) + assert(errMsg.contains("'SHOW PARTITIONS' expects a table")) } } } @@ -159,12 +81,12 @@ class ShowPartitionsSuite extends ShowPartitionsSuiteBase with SharedSparkSessio val viewName = "test_view" withTempView(viewName) { sql(s""" - |CREATE TEMPORARY VIEW $viewName (c1 INT, c2 STRING) - |$defaultUsing""".stripMargin) - val errMsg = intercept[NoSuchTableException] { + |CREATE TEMPORARY VIEW $viewName (c1 INT, c2 STRING) + |$defaultUsing""".stripMargin) + val errMsg = intercept[AnalysisException] { sql(s"SHOW PARTITIONS $viewName") }.getMessage - assert(errMsg.contains(s"Table or view '$viewName' not found")) + assert(errMsg.contains("'SHOW PARTITIONS' expects a table")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala index 8a63cd49e89e9..ca47a713ad604 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala @@ -19,38 +19,34 @@ package org.apache.spark.sql.execution.command.v2 import org.apache.spark.SparkConf import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.connector.InMemoryTableCatalog +import org.apache.spark.sql.connector.{InMemoryPartitionTableCatalog, InMemoryTableCatalog} import org.apache.spark.sql.execution.command import org.apache.spark.sql.test.SharedSparkSession class ShowPartitionsSuite extends command.ShowPartitionsSuiteBase with SharedSparkSession { override def version: String = "V2" override def catalog: String = "test_catalog" - override def defaultNamespace: Seq[String] = Nil override def defaultUsing: String = "USING _" override def sparkConf: SparkConf = super.sparkConf - .set(s"spark.sql.catalog.$catalog", classOf[InMemoryTableCatalog].getName) + .set(s"spark.sql.catalog.$catalog", classOf[InMemoryPartitionTableCatalog].getName) + .set(s"spark.sql.catalog.non_part_$catalog", classOf[InMemoryTableCatalog].getName) - // TODO(SPARK-33452): Create a V2 SHOW PARTITIONS execution node - test("not supported SHOW PARTITIONS") { - def testV1Command(sqlCommand: String, sqlParams: String): Unit = { - val e = intercept[AnalysisException] { - sql(s"$sqlCommand $sqlParams") - } - assert(e.message.contains(s"$sqlCommand is only supported with v1 tables")) - } - val t = s"$catalog.ns1.ns2.tbl" - withTable(t) { - sql( - s""" - |CREATE TABLE $t (id bigint, data string) - |$defaultUsing - |PARTITIONED BY (id) - """.stripMargin) + override protected def wrongPartitionColumnsError(columns: String*): String = { + s"${columns.head} is not a valid partition column" + } - testV1Command("SHOW PARTITIONS", t) - 
testV1Command("SHOW PARTITIONS", s"$t PARTITION(id='1')") + test("a table does not support partitioning") { + val table = s"non_part_$catalog.tab1" + withTable(table) { + sql(s""" + |CREATE TABLE $table (price int, qty int, year int, month int) + |$defaultUsing""".stripMargin) + val errMsg = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $table") + }.getMessage + assert(errMsg.contains( + "SHOW PARTITIONS cannot run for a table which does not support partitioning")) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index 3af163af0968c..49e26614e13c4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -300,7 +300,7 @@ class PartitionedTablePerfStatsSuite HiveCatalogMetrics.reset() assert(spark.sql("show partitions test").count() == 100) - assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() <= 10) } } } @@ -323,7 +323,7 @@ class PartitionedTablePerfStatsSuite HiveCatalogMetrics.reset() assert(spark.sql("show partitions test").count() == 100) - assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() <= 10) } } } From 6fd148fea890391941f876e0a14446d875fe72e1 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 30 Nov 2020 14:05:49 +0000 Subject: [PATCH 050/150] [SPARK-33569][SQL] Remove getting partitions by an identifier prefix ### What changes were proposed in this pull request? 1. Remove the method `listPartitionIdentifiers()` from the `SupportsPartitionManagement` interface. The method lists partitions by ident prefix. 2. Rename `listPartitionByNames()` to `listPartitionIdentifiers()`. 3. Re-implement the default method `partitionExists()` using new method. ### Why are the changes needed? Getting partitions by ident prefix only is not used, and it can be removed to improve code maintenance. Also this makes the `SupportsPartitionManagement` interface cleaner. ### Does this PR introduce _any_ user-facing change? Should not. ### How was this patch tested? By running the affected test suites: ``` $ build/sbt "test:testOnly org.apache.spark.sql.connector.catalog.*" ``` Closes #30514 from MaxGekk/remove-listPartitionIdentifiers. 
Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../catalog/SupportsPartitionManagement.java | 15 ++---- .../connector/InMemoryPartitionTable.scala | 10 +--- ...pportsAtomicPartitionManagementSuite.scala | 28 ++++++----- .../SupportsPartitionManagementSuite.scala | 48 ++++++++++--------- .../AlterTablePartitionV2SQLSuite.scala | 6 ++- 5 files changed, 52 insertions(+), 55 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java index 380717d2e0e9b..9d898f2f477e1 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsPartitionManagement.java @@ -17,6 +17,7 @@ package org.apache.spark.sql.connector.catalog; +import java.util.Arrays; import java.util.Map; import org.apache.spark.annotation.Experimental; @@ -79,7 +80,9 @@ void createPartition( * @return true if the partition exists, false otherwise */ default boolean partitionExists(InternalRow ident) { - return listPartitionIdentifiers(ident).length > 0; + String[] partitionNames = partitionSchema().names(); + String[] requiredNames = Arrays.copyOfRange(partitionNames, 0, ident.numFields()); + return listPartitionIdentifiers(requiredNames, ident).length > 0; } /** @@ -105,14 +108,6 @@ void replacePartitionMetadata( Map loadPartitionMetadata(InternalRow ident) throws UnsupportedOperationException; - /** - * List the identifiers of all partitions that have the ident prefix in a table. - * - * @param ident a prefix of partition identifier - * @return an array of Identifiers for the partitions - */ - InternalRow[] listPartitionIdentifiers(InternalRow ident); - /** * List the identifiers of all partitions that match to the ident by names. * @@ -120,5 +115,5 @@ Map loadPartitionMetadata(InternalRow ident) * @param ident a partition identifier values. 
* @return an array of Identifiers for the partitions */ - InternalRow[] listPartitionByNames(String[] names, InternalRow ident); + InternalRow[] listPartitionIdentifiers(String[] names, InternalRow ident); } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala index ba762a58b1e52..6a8432e635310 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryPartitionTable.scala @@ -83,14 +83,6 @@ class InMemoryPartitionTable( } } - def listPartitionIdentifiers(ident: InternalRow): Array[InternalRow] = { - val prefixPartCols = - new StructType(partitionSchema.dropRight(partitionSchema.length - ident.numFields).toArray) - val prefixPart = ident.toSeq(prefixPartCols) - memoryTablePartitions.keySet().asScala - .filter(_.toSeq(partitionSchema).startsWith(prefixPart)).toArray - } - override def partitionExists(ident: InternalRow): Boolean = memoryTablePartitions.containsKey(ident) @@ -98,7 +90,7 @@ class InMemoryPartitionTable( memoryTablePartitions.put(InternalRow.fromSeq(key), Map.empty[String, String].asJava) } - override def listPartitionByNames( + override def listPartitionIdentifiers( names: Array[String], ident: InternalRow): Array[InternalRow] = { assert(names.length == ident.numFields, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagementSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagementSuite.scala index 6f7c30653110b..ad2631650b7ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagementSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsAtomicPartitionManagementSuite.scala @@ -47,34 +47,38 @@ class SupportsAtomicPartitionManagementSuite extends SparkFunSuite { newCatalog } + private def hasPartitions(table: SupportsPartitionManagement): Boolean = { + !table.listPartitionIdentifiers(Array.empty, InternalRow.empty).isEmpty + } + test("createPartitions") { val table = catalog.loadTable(ident) val partTable = new InMemoryAtomicPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdents = Array(InternalRow.apply("3"), InternalRow.apply("4")) partTable.createPartitions( partIdents, Array(new util.HashMap[String, String](), new util.HashMap[String, String]())) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + assert(hasPartitions(partTable)) assert(partTable.partitionExists(InternalRow.apply("3"))) assert(partTable.partitionExists(InternalRow.apply("4"))) partTable.dropPartition(InternalRow.apply("3")) partTable.dropPartition(InternalRow.apply("4")) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("createPartitions failed if partition already exists") { val table = catalog.loadTable(ident) val partTable = new InMemoryAtomicPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = InternalRow.apply("4") partTable.createPartition(partIdent, new 
util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + assert(hasPartitions(partTable)) assert(partTable.partitionExists(partIdent)) val partIdents = Array(InternalRow.apply("3"), InternalRow.apply("4")) @@ -85,42 +89,42 @@ class SupportsAtomicPartitionManagementSuite extends SparkFunSuite { assert(!partTable.partitionExists(InternalRow.apply("3"))) partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("dropPartitions") { val table = catalog.loadTable(ident) val partTable = new InMemoryAtomicPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdents = Array(InternalRow.apply("3"), InternalRow.apply("4")) partTable.createPartitions( partIdents, Array(new util.HashMap[String, String](), new util.HashMap[String, String]())) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + assert(hasPartitions(partTable)) assert(partTable.partitionExists(InternalRow.apply("3"))) assert(partTable.partitionExists(InternalRow.apply("4"))) partTable.dropPartitions(partIdents) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("dropPartitions failed if partition not exists") { val table = catalog.loadTable(ident) val partTable = new InMemoryAtomicPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = InternalRow.apply("4") partTable.createPartition(partIdent, new util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).length == 1) + assert(partTable.listPartitionIdentifiers(Array.empty, InternalRow.empty).length == 1) val partIdents = Array(InternalRow.apply("3"), InternalRow.apply("4")) assert(!partTable.dropPartitions(partIdents)) assert(partTable.partitionExists(partIdent)) partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala index caf7e91612563..9de0fe6108c99 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/SupportsPartitionManagementSuite.scala @@ -48,97 +48,101 @@ class SupportsPartitionManagementSuite extends SparkFunSuite { newCatalog } + private def hasPartitions(table: SupportsPartitionManagement): Boolean = { + !table.listPartitionIdentifiers(Array.empty, InternalRow.empty).isEmpty + } + test("createPartition") { val table = catalog.loadTable(ident) val partTable = new InMemoryPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = InternalRow.apply("3") partTable.createPartition(partIdent, new util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + 
assert(hasPartitions(partTable)) assert(partTable.partitionExists(partIdent)) partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("dropPartition") { val table = catalog.loadTable(ident) val partTable = new InMemoryPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = InternalRow.apply("3") val partIdent1 = InternalRow.apply("4") partTable.createPartition(partIdent, new util.HashMap[String, String]()) partTable.createPartition(partIdent1, new util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).length == 2) + assert(partTable.listPartitionIdentifiers(Array.empty, InternalRow.empty).length == 2) partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).length == 1) + assert(partTable.listPartitionIdentifiers(Array.empty, InternalRow.empty).length == 1) partTable.dropPartition(partIdent1) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("replacePartitionMetadata") { val table = catalog.loadTable(ident) val partTable = new InMemoryPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = InternalRow.apply("3") partTable.createPartition(partIdent, new util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + assert(hasPartitions(partTable)) assert(partTable.partitionExists(partIdent)) assert(partTable.loadPartitionMetadata(partIdent).isEmpty) partTable.replacePartitionMetadata(partIdent, Map("paramKey" -> "paramValue").asJava) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + assert(hasPartitions(partTable)) assert(partTable.partitionExists(partIdent)) assert(!partTable.loadPartitionMetadata(partIdent).isEmpty) assert(partTable.loadPartitionMetadata(partIdent).get("paramKey") == "paramValue") partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("loadPartitionMetadata") { val table = catalog.loadTable(ident) val partTable = new InMemoryPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = InternalRow.apply("3") partTable.createPartition(partIdent, Map("paramKey" -> "paramValue").asJava) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).nonEmpty) + assert(hasPartitions(partTable)) assert(partTable.partitionExists(partIdent)) assert(!partTable.loadPartitionMetadata(partIdent).isEmpty) assert(partTable.loadPartitionMetadata(partIdent).get("paramKey") == "paramValue") partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("listPartitionIdentifiers") { val table = catalog.loadTable(ident) val partTable = new InMemoryPartitionTable( table.name(), table.schema(), table.partitioning(), table.properties()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) val partIdent = 
InternalRow.apply("3") partTable.createPartition(partIdent, new util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).length == 1) + assert(partTable.listPartitionIdentifiers(Array.empty, InternalRow.empty).length == 1) val partIdent1 = InternalRow.apply("4") partTable.createPartition(partIdent1, new util.HashMap[String, String]()) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).length == 2) - assert(partTable.listPartitionIdentifiers(partIdent1).length == 1) + assert(partTable.listPartitionIdentifiers(Array.empty, InternalRow.empty).length == 2) + assert(partTable.listPartitionIdentifiers(Array("dt"), partIdent1).length == 1) partTable.dropPartition(partIdent) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).length == 1) + assert(partTable.listPartitionIdentifiers(Array.empty, InternalRow.empty).length == 1) partTable.dropPartition(partIdent1) - assert(partTable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert(!hasPartitions(partTable)) } test("listPartitionByNames") { @@ -170,7 +174,7 @@ class SupportsPartitionManagementSuite extends SparkFunSuite { (Array("part0", "part1"), InternalRow(3, "xyz")) -> Set(), (Array("part1"), InternalRow(3.14f)) -> Set() ).foreach { case ((names, idents), expected) => - assert(partTable.listPartitionByNames(names, idents).toSet === expected) + assert(partTable.listPartitionIdentifiers(names, idents).toSet === expected) } // Check invalid parameters Seq( @@ -178,7 +182,7 @@ class SupportsPartitionManagementSuite extends SparkFunSuite { (Array("col0", "part1"), InternalRow(0, 1)), (Array("wrong"), InternalRow("invalid")) ).foreach { case (names, idents) => - intercept[AssertionError](partTable.listPartitionByNames(names, idents)) + intercept[AssertionError](partTable.listPartitionIdentifiers(names, idents)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala index 4cacd5ec2b49e..3583eceec7559 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala @@ -141,7 +141,8 @@ class AlterTablePartitionV2SQLSuite extends DatasourceV2SQLBase { catalog("testpart").asTableCatalog.loadTable(Identifier.of(Array("ns1", "ns2"), "tbl")) assert(!partTable.asPartitionable.partitionExists(InternalRow.fromSeq(Seq(1)))) assert(!partTable.asPartitionable.partitionExists(InternalRow.fromSeq(Seq(2)))) - assert(partTable.asPartitionable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert( + partTable.asPartitionable.listPartitionIdentifiers(Array.empty, InternalRow.empty).isEmpty) } } @@ -161,7 +162,8 @@ class AlterTablePartitionV2SQLSuite extends DatasourceV2SQLBase { spark.sql(s"ALTER TABLE $t DROP IF EXISTS PARTITION (id=1), PARTITION (id=2)") assert(!partTable.asPartitionable.partitionExists(InternalRow.fromSeq(Seq(1)))) assert(!partTable.asPartitionable.partitionExists(InternalRow.fromSeq(Seq(2)))) - assert(partTable.asPartitionable.listPartitionIdentifiers(InternalRow.empty).isEmpty) + assert( + partTable.asPartitionable.listPartitionIdentifiers(Array.empty, InternalRow.empty).isEmpty) } } From 030b3139dadc342e82d71f3fb241c320a7577131 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 30 Nov 2020 16:40:36 +0000 Subject: [PATCH 051/150] [SPARK-33569][SPARK-33452][SQL][FOLLOWUP] Fix a build error in 
`ShowPartitionsExec` ### What changes were proposed in this pull request? Use `listPartitionIdentifiers ` instead of `listPartitionByNames` in `ShowPartitionsExec`. The `listPartitionByNames` was renamed by https://github.com/apache/spark/pull/30514. ### Why are the changes needed? To fix build error. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running tests for the `SHOW PARTITIONS` command: ``` $ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly *ShowPartitionsSuite" ``` Closes #30553 from MaxGekk/fix-build-show-partitions-exec. Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../spark/sql/execution/datasources/v2/ShowPartitionsExec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala index 44d6f4495f552..c4b6aa805d58f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowPartitionsExec.scala @@ -41,7 +41,7 @@ case class ShowPartitionsExec( // listPartitionByNames() should return all partitions if the partition spec // does not specify any partition names. .getOrElse((Seq.empty[String], InternalRow.empty)) - val partitionIdentifiers = table.listPartitionByNames(names.toArray, ident) + val partitionIdentifiers = table.listPartitionIdentifiers(names.toArray, ident) // Converting partition identifiers as `InternalRow` of partition values, // for instance InternalRow(value0, value1, ..., valueN), to `InternalRow`s // with a string in the format: "col0=value0/col1=value1/.../colN=valueN". From f3c2583cc3ad6a2a24bfb09e2ee7af4e63e5bf66 Mon Sep 17 00:00:00 2001 From: Erik Krogen Date: Mon, 30 Nov 2020 14:40:51 -0600 Subject: [PATCH 052/150] [SPARK-33185][YARN][FOLLOW-ON] Leverage RM's RPC API instead of REST to fetch driver log links in yarn.Client ### What changes were proposed in this pull request? This is a follow-on to PR #30096 which initially added support for printing direct links to the driver stdout/stderr logs from the application report output in `yarn.Client` using the `spark.yarn.includeDriverLogsLink` configuration. That PR made use of the ResourceManager's REST APIs to fetch the necessary information to construct the links. This PR proposes removing the dependency on the REST API, since the new logic is the only place in `yarn.Client` which makes use of this API, and instead leverages the RPC API via `YarnClient`, which brings the code in line with the rest of `yarn.Client`. ### Why are the changes needed? While the old logic worked okay when running a Spark application in a "standard" environment with full access to Kerberos credentials, it can fail when run in an environment with restricted Kerberos credentials. In our case, this environment is represented by [Azkaban](https://azkaban.github.io/), but it likely affects other job scheduling systems as well. 
In such an environment, the application has delegation tokens which enabled it to communicate with services such as YARN, but the RM REST API is not typically covered by such delegation tokens (note that although YARN does actually support accessing the RM REST API via a delegation token as documented [here](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceManagerRest.html#Cluster_Delegation_Tokens_API), it is a new feature in alpha phase, and most deployments are likely not retrieving this token today). Besides this enhancement, leveraging the `YarnClient` APIs greatly simplifies the processing logic, such as removing all JSON parsing. ### Does this PR introduce _any_ user-facing change? Very minimal user-facing changes on top of PR #30096. Basically expands the scope of environments in which that feature will operate correctly. ### How was this patch tested? In addition to redoing the `spark-submit` testing as mentioned in PR #30096, I also tested this logic in a restricted-credentials environment (Azkaban). It succeeds where the previous logic would fail with a 401 error. Closes #30450 from xkrogen/xkrogen-SPARK-33185-driverlogs-followon. Authored-by: Erik Krogen Signed-off-by: Mridul Muralidharan gmail.com> --- .../org/apache/spark/deploy/yarn/Client.scala | 67 +++++++------------ .../spark/deploy/yarn/ClientSuite.scala | 47 ------------- .../spark/deploy/yarn/YarnClusterSuite.scala | 31 +++++++++ 3 files changed, 54 insertions(+), 91 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 552167c935b30..d252e8368a0c4 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -29,12 +29,8 @@ import scala.collection.immutable.{Map => IMap} import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map} import scala.util.control.NonFatal -import com.fasterxml.jackson.databind.ObjectMapper import com.google.common.base.Objects import com.google.common.io.Files -import javax.ws.rs.client.ClientBuilder -import javax.ws.rs.core.MediaType -import javax.ws.rs.core.Response.Status.Family import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.hadoop.fs.permission.FsPermission @@ -51,7 +47,6 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException import org.apache.hadoop.yarn.security.AMRMTokenIdentifier import org.apache.hadoop.yarn.util.Records -import org.apache.hadoop.yarn.webapp.util.WebAppUtils import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.api.python.PythonUtils @@ -1089,9 +1084,9 @@ private[spark] class Client( // If DEBUG is enabled, log report details every iteration // Otherwise, log them every time the application changes state if (log.isDebugEnabled) { - logDebug(formatReportDetails(report, getDriverLogsLink(report.getApplicationId))) + logDebug(formatReportDetails(report, getDriverLogsLink(report))) } else if (lastState != state) { - logInfo(formatReportDetails(report, getDriverLogsLink(report.getApplicationId))) + logInfo(formatReportDetails(report, getDriverLogsLink(report))) } } @@ -1192,33 +1187,31 @@ private[spark] class Client( } /** - * Fetch links to the logs of the driver for the given application ID. 
This requires hitting the - * RM REST API. Returns an empty map if the links could not be fetched. If this feature is - * disabled via [[CLIENT_INCLUDE_DRIVER_LOGS_LINK]], an empty map is returned immediately. + * Fetch links to the logs of the driver for the given application report. This requires + * query the ResourceManager via RPC. Returns an empty map if the links could not be fetched. + * If this feature is disabled via [[CLIENT_INCLUDE_DRIVER_LOGS_LINK]], or if the application + * report indicates that the driver container isn't currently running, an empty map is + * returned immediately. */ - private def getDriverLogsLink(appId: ApplicationId): IMap[String, String] = { - if (!sparkConf.get(CLIENT_INCLUDE_DRIVER_LOGS_LINK)) { - return IMap() + private def getDriverLogsLink(appReport: ApplicationReport): IMap[String, String] = { + if (!sparkConf.get(CLIENT_INCLUDE_DRIVER_LOGS_LINK) + || appReport.getYarnApplicationState != YarnApplicationState.RUNNING) { + return IMap.empty } try { - val baseRmUrl = WebAppUtils.getRMWebAppURLWithScheme(hadoopConf) - val response = ClientBuilder.newClient() - .target(baseRmUrl) - .path("ws").path("v1").path("cluster").path("apps") - .path(appId.toString).path("appattempts") - .request(MediaType.APPLICATION_JSON) - .get() - response.getStatusInfo.getFamily match { - case Family.SUCCESSFUL => parseAppAttemptsJsonResponse(response.readEntity(classOf[String])) - case _ => - logWarning(s"Unable to fetch app attempts info from $baseRmUrl, got " - + s"status code ${response.getStatus}: ${response.getStatusInfo.getReasonPhrase}") - IMap() - } + Option(appReport.getCurrentApplicationAttemptId) + .flatMap(attemptId => Option(yarnClient.getApplicationAttemptReport(attemptId))) + .flatMap(attemptReport => Option(attemptReport.getAMContainerId)) + .flatMap(amContainerId => Option(yarnClient.getContainerReport(amContainerId))) + .flatMap(containerReport => Option(containerReport.getLogUrl)) + .map(YarnContainerInfoHelper.getLogUrlsFromBaseUrl) + .getOrElse(IMap.empty) } catch { case e: Exception => - logWarning(s"Unable to get driver log links for $appId", e) - IMap() + logWarning(s"Unable to get driver log links for $appId: $e") + // Include the full stack trace only at DEBUG level to reduce verbosity + logDebug(s"Unable to get driver log links for $appId", e) + IMap.empty } } @@ -1236,7 +1229,7 @@ private[spark] class Client( val report = getApplicationReport(appId) val state = report.getYarnApplicationState logInfo(s"Application report for $appId (state: $state)") - logInfo(formatReportDetails(report, getDriverLogsLink(report.getApplicationId))) + logInfo(formatReportDetails(report, getDriverLogsLink(report))) if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) { throw new SparkException(s"Application $appId finished with status: $state") } @@ -1627,20 +1620,6 @@ private object Client extends Logging { writer.flush() out.closeEntry() } - - private[yarn] def parseAppAttemptsJsonResponse(jsonString: String): IMap[String, String] = { - val objectMapper = new ObjectMapper() - // If JSON response is malformed somewhere along the way, MissingNode will be returned, - // which allows for safe continuation of chaining. The `elements()` call will be empty, - // and None will get returned. 
- objectMapper.readTree(jsonString) - .path("appAttempts").path("appAttempt") - .elements().asScala.toList.takeRight(1).headOption - .map(_.path("logsLink").asText("")) - .filterNot(_ == "") - .map(baseUrl => YarnContainerInfoHelper.getLogUrlsFromBaseUrl(baseUrl)) - .getOrElse(IMap()) - } } private[spark] class YarnClusterApplication extends SparkApplication { diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index fccb2406d66f8..ea3acec3bb78b 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -583,53 +583,6 @@ class ClientSuite extends SparkFunSuite with Matchers { } } - test("SPARK-33185 Parse YARN AppAttempts valid JSON response") { - val appIdSuffix = "1500000000000_1234567" - val containerId = s"container_e1_${appIdSuffix}_01_000001" - val nodeHost = "node.example.com" - val jsonString = - s""" - |{"appAttempts": { - | "appAttempt": [ { - | "id":1, - | "startTime":1600000000000, - | "finishedTime":1600000100000, - | "containerId":"$containerId", - | "nodeHttpAddress":"$nodeHost:8042", - | "nodeId":"node.example.com:8041", - | "logsLink":"http://$nodeHost:8042/node/containerlogs/$containerId/username", - | "blacklistedNodes":"", - | "nodesBlacklistedBySystem":"", - | "appAttemptId":"appattempt_${appIdSuffix}_000001" - | }] - |}} - |""".stripMargin - val logLinkMap = Client.parseAppAttemptsJsonResponse(jsonString) - assert(logLinkMap.keySet === Set("stdout", "stderr")) - assert(logLinkMap("stdout") === - s"http://$nodeHost:8042/node/containerlogs/$containerId/username/stdout?start=-4096") - assert(logLinkMap("stderr") === - s"http://$nodeHost:8042/node/containerlogs/$containerId/username/stderr?start=-4096") - } - - test("SPARK-33185 Parse YARN AppAttempts invalid JSON response") { - // No "appAttempt" present - assert(Client.parseAppAttemptsJsonResponse("""{"appAttempts": { } }""") === Map()) - - // "appAttempt" is empty - assert(Client.parseAppAttemptsJsonResponse("""{"appAttempts": { "appAttempt": [ ] } }""") - === Map()) - - // logsLink is missing - assert(Client.parseAppAttemptsJsonResponse("""{"appAttempts":{"appAttempt":[{"id":1}]}}""") - === Map()) - - // logsLink is present but empty - assert( - Client.parseAppAttemptsJsonResponse("""{"appAttempts":{"appAttempt":[{"logsLink":""}]}}""") - === Map()) - } - private val matching = Seq( ("files URI match test1", "file:///file1", "file:///file2"), ("files URI match test2", "file:///c:file1", "file://c:file2"), diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index cf754cca315f0..222b24ca12dce 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -230,6 +230,37 @@ class YarnClusterSuite extends BaseYarnClusterSuite { } } + test("running Spark in yarn-cluster mode displays driver log links") { + val log4jConf = new File(tempDir, "log4j.properties") + val logOutFile = new File(tempDir, "logs") + Files.write( + s"""log4j.rootCategory=DEBUG,file + |log4j.appender.file=org.apache.log4j.FileAppender + |log4j.appender.file.file=$logOutFile + 
|log4j.appender.file.layout=org.apache.log4j.PatternLayout + |""".stripMargin, + log4jConf, StandardCharsets.UTF_8) + // Since this test is trying to extract log output from the SparkSubmit process itself, + // standard options to the Spark process don't take effect. Leverage the java-opts file which + // will get picked up for the SparkSubmit process. + val confDir = new File(tempDir, "conf") + confDir.mkdir() + val javaOptsFile = new File(confDir, "java-opts") + Files.write(s"-Dlog4j.configuration=file://$log4jConf\n", javaOptsFile, StandardCharsets.UTF_8) + + val result = File.createTempFile("result", null, tempDir) + val finalState = runSpark(clientMode = false, + mainClassName(YarnClusterDriver.getClass), + appArgs = Seq(result.getAbsolutePath), + extraEnv = Map("SPARK_CONF_DIR" -> confDir.getAbsolutePath), + extraConf = Map(CLIENT_INCLUDE_DRIVER_LOGS_LINK.key -> true.toString)) + checkResult(finalState, result) + val logOutput = Files.toString(logOutFile, StandardCharsets.UTF_8) + val logFilePattern = raw"""(?s).+\sDriver Logs \(\): https?://.+/(\?\S+)?\s.+""" + logOutput should fullyMatch regex logFilePattern.replace("", "stdout") + logOutput should fullyMatch regex logFilePattern.replace("", "stderr") + } + test("timeout to get SparkContext in cluster mode triggers failure") { val timeout = 2000 val finalState = runSpark(false, mainClassName(SparkContextTimeoutApp.getClass), From c6994354f70061b2a15445dbd298a2db926b548c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 30 Nov 2020 13:29:50 -0800 Subject: [PATCH 053/150] [SPARK-33545][CORE] Support Fallback Storage during Worker decommission ### What changes were proposed in this pull request? This PR aims to support storage migration to the fallback storage like cloud storage (`S3`) during worker decommission for the corner cases where the exceptions occur or there is no live peer left. Although this PR focuses on cloud storage like `S3` which has a TTL feature in order to simplify Spark's logic, we can use alternative fallback storages like HDFS/NFS(EFS) if the user provides a clean-up mechanism. ### Why are the changes needed? Currently, storage migration is not possible when there is no available executor. For example, when there is one executor, the executor cannot perform storage migration because it has no peer. ### Does this PR introduce _any_ user-facing change? Yes. This is a new feature. ### How was this patch tested? Pass the CIs with newly added test cases. Closes #30492 from dongjoon-hyun/SPARK-33545. 
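For reference, a minimal sketch of how a job might opt in to the fallback storage added here. Only `spark.storage.decommission.fallbackStorage.path` (and its example `s3a://` path and trailing-slash requirement) comes from this patch; the other decommission keys, the app name, and the bucket below are illustrative assumptions, not part of this change.

```python
from pyspark.sql import SparkSession

# Hedged sketch: enable storage decommissioning with a fallback location.
# The fallback path must end with "/" and should live on storage with a TTL
# policy (e.g. an S3 lifecycle rule), since Spark will not clean it up.
spark = (
    SparkSession.builder
    .appName("fallback-storage-demo")                                    # hypothetical app name
    .config("spark.decommission.enabled", "true")                        # assumed existing key
    .config("spark.storage.decommission.enabled", "true")                # assumed existing key
    .config("spark.storage.decommission.shuffleBlocks.enabled", "true")  # assumed existing key
    .config("spark.storage.decommission.fallbackStorage.path",
            "s3a://spark-storage/")  # key and example path from this patch
    .getOrCreate()
)
```

With settings along these lines, a decommissioning executor that has no live peers (or that hits upload failures) copies its shuffle index/data files to the fallback path instead of losing them, as the diffs below implement.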
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- core/pom.xml | 41 +++ .../scala/org/apache/spark/SparkContext.scala | 1 + .../spark/internal/config/package.scala | 10 + .../shuffle/IndexShuffleBlockResolver.scala | 2 +- .../apache/spark/storage/BlockManager.scala | 18 +- .../storage/BlockManagerDecommissioner.scala | 3 + .../spark/storage/FallbackStorage.scala | 174 +++++++++++ .../storage/ShuffleBlockFetcherIterator.scala | 3 +- .../spark/storage/FallbackStorageSuite.scala | 269 ++++++++++++++++++ 9 files changed, 517 insertions(+), 4 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala create mode 100644 core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala diff --git a/core/pom.xml b/core/pom.xml index 7a56c4ca3c638..9d2bf7dbe57a9 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -461,6 +461,47 @@ test + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + test + + + org.apache.hadoop + hadoop-common + + + commons-logging + commons-logging + + + org.codehaus.jackson + jackson-mapper-asl + + + org.codehaus.jackson + jackson-core-asl + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-annotations + + + + com.amazonaws + aws-java-sdk + + + org.apache.commons commons-crypto diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 0440a9de6ab31..b953592fa04dc 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -576,6 +576,7 @@ class SparkContext(config: SparkConf) extends Logging { } _ui.foreach(_.setAppId(_applicationId)) _env.blockManager.initialize(_applicationId) + FallbackStorage.registerBlockManagerIfNeeded(_env.blockManager.master, _conf) // The metrics system for Driver need to be set spark.app.id to app ID. // So it should start after we get app ID from the task scheduler and set spark.app.id. diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index b8bcb374ef961..093a0ecf58d32 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -471,6 +471,16 @@ package object config { "cache block replication should be positive.") .createWithDefaultString("30s") + private[spark] val STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH = + ConfigBuilder("spark.storage.decommission.fallbackStorage.path") + .doc("The location for fallback storage during block manager decommissioning. " + + "For example, `s3a://spark-storage/`. In case of empty, fallback storage is disabled. 
" + + "The storage should be managed by TTL because Spark will not clean it up.") + .version("3.1.0") + .stringConf + .checkValue(_.endsWith(java.io.File.separator), "Path should end with separator.") + .createOptional + private[spark] val STORAGE_REPLICATION_TOPOLOGY_FILE = ConfigBuilder("spark.storage.replication.topologyFile") .version("2.1.0") diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index e5df27c0d3c7a..5f0bb42108c56 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -91,7 +91,7 @@ private[spark] class IndexShuffleBlockResolver( * When the dirs parameter is None then use the disk manager's local directories. Otherwise, * read from the specified directories. */ - private def getIndexFile( + def getIndexFile( shuffleId: Int, mapId: Long, dirs: Option[Array[String]] = None): File = { diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 072702b343328..a5b8d5d0c8cda 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -627,7 +627,16 @@ private[spark] class BlockManager( override def getLocalBlockData(blockId: BlockId): ManagedBuffer = { if (blockId.isShuffle) { logDebug(s"Getting local shuffle block ${blockId}") - shuffleManager.shuffleBlockResolver.getBlockData(blockId) + try { + shuffleManager.shuffleBlockResolver.getBlockData(blockId) + } catch { + case e: IOException => + if (conf.get(config.STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) { + FallbackStorage.read(conf, blockId) + } else { + throw e + } + } } else { getLocalBytes(blockId) match { case Some(blockData) => @@ -1580,7 +1589,12 @@ private[spark] class BlockManager( lastPeerFetchTimeNs = System.nanoTime() logDebug("Fetched peers from master: " + cachedPeers.mkString("[", ",", "]")) } - cachedPeers + if (cachedPeers.isEmpty && + conf.get(config.STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) { + Seq(FallbackStorage.FALLBACK_BLOCK_MANAGER_ID) + } else { + cachedPeers + } } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala index 7a55039db1b60..e73e359a70f1e 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala @@ -39,6 +39,7 @@ private[storage] class BlockManagerDecommissioner( conf: SparkConf, bm: BlockManager) extends Logging { + private val fallbackStorage = FallbackStorage.getFallbackStorage(conf) private val maxReplicationFailuresForDecommission = conf.get(config.STORAGE_DECOMMISSION_MAX_REPLICATION_FAILURE_PER_BLOCK) @@ -114,6 +115,8 @@ private[storage] class BlockManagerDecommissioner( // driver a no longer referenced RDD with shuffle files. 
if (bm.migratableResolver.getMigrationBlocks(shuffleBlockInfo).isEmpty) { logWarning(s"Skipping block ${shuffleBlockInfo}, block deleted.") + } else if (fallbackStorage.isDefined) { + fallbackStorage.foreach(_.copy(shuffleBlockInfo, bm)) } else { throw e } diff --git a/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala b/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala new file mode 100644 index 0000000000000..9221731f77a59 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.storage + +import java.io.DataInputStream +import java.nio.ByteBuffer + +import scala.concurrent.Future +import scala.reflect.ClassTag + +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH +import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} +import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcTimeout} +import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleBlockInfo} +import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID +import org.apache.spark.util.Utils + +/** + * A fallback storage used by storage decommissioners. 
+ */ +private[storage] class FallbackStorage(conf: SparkConf) extends Logging { + require(conf.contains("spark.app.id")) + require(conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) + + private val fallbackPath = new Path(conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).get) + private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + private val fallbackFileSystem = FileSystem.get(fallbackPath.toUri, hadoopConf) + private val appId = conf.getAppId + + // Visible for testing + def copy( + shuffleBlockInfo: ShuffleBlockInfo, + bm: BlockManager): Unit = { + val shuffleId = shuffleBlockInfo.shuffleId + val mapId = shuffleBlockInfo.mapId + + bm.migratableResolver match { + case r: IndexShuffleBlockResolver => + val indexFile = r.getIndexFile(shuffleId, mapId) + + if (indexFile.exists()) { + fallbackFileSystem.copyFromLocalFile( + new Path(indexFile.getAbsolutePath), + new Path(fallbackPath, s"$appId/$shuffleId/${indexFile.getName}")) + + val dataFile = r.getDataFile(shuffleId, mapId) + if (dataFile.exists()) { + fallbackFileSystem.copyFromLocalFile( + new Path(dataFile.getAbsolutePath), + new Path(fallbackPath, s"$appId/$shuffleId/${dataFile.getName}")) + } + + // Report block statuses + val reduceId = NOOP_REDUCE_ID + val indexBlockId = ShuffleIndexBlockId(shuffleId, mapId, reduceId) + FallbackStorage.reportBlockStatus(bm, indexBlockId, indexFile.length) + if (dataFile.exists) { + val dataBlockId = ShuffleDataBlockId(shuffleId, mapId, reduceId) + FallbackStorage.reportBlockStatus(bm, dataBlockId, dataFile.length) + } + } + case r => + logWarning(s"Unsupported Resolver: ${r.getClass.getName}") + } + } + + def exists(shuffleId: Int, filename: String): Boolean = { + fallbackFileSystem.exists(new Path(fallbackPath, s"$appId/$shuffleId/$filename")) + } +} + +class NoopRpcEndpointRef(conf: SparkConf) extends RpcEndpointRef(conf) { + import scala.concurrent.ExecutionContext.Implicits.global + override def address: RpcAddress = null + override def name: String = "fallback" + override def send(message: Any): Unit = {} + override def ask[T: ClassTag](message: Any, timeout: RpcTimeout): Future[T] = { + Future{true.asInstanceOf[T]} + } +} + +object FallbackStorage extends Logging { + /** We use one block manager id as a place holder. */ + val FALLBACK_BLOCK_MANAGER_ID: BlockManagerId = BlockManagerId("fallback", "remote", 7337) + + def getFallbackStorage(conf: SparkConf): Option[FallbackStorage] = { + if (conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) { + Some(new FallbackStorage(conf)) + } else { + None + } + } + + /** Register the fallback block manager and its RPC endpoint. */ + def registerBlockManagerIfNeeded(master: BlockManagerMaster, conf: SparkConf): Unit = { + if (conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) { + master.registerBlockManager( + FALLBACK_BLOCK_MANAGER_ID, Array.empty[String], 0, 0, new NoopRpcEndpointRef(conf)) + } + } + + /** Report block status to block manager master and map output tracker master. */ + private def reportBlockStatus(blockManager: BlockManager, blockId: BlockId, dataLength: Long) = { + assert(blockManager.master != null) + blockManager.master.updateBlockInfo( + FALLBACK_BLOCK_MANAGER_ID, blockId, StorageLevel.DISK_ONLY, memSize = 0, dataLength) + } + + /** + * Read a ManagedBuffer. 
+ */ + def read(conf: SparkConf, blockId: BlockId): ManagedBuffer = { + logInfo(s"Read $blockId") + val fallbackPath = new Path(conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).get) + val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + val fallbackFileSystem = FileSystem.get(fallbackPath.toUri, hadoopConf) + val appId = conf.getAppId + + val (shuffleId, mapId, startReduceId, endReduceId) = blockId match { + case id: ShuffleBlockId => + (id.shuffleId, id.mapId, id.reduceId, id.reduceId + 1) + case batchId: ShuffleBlockBatchId => + (batchId.shuffleId, batchId.mapId, batchId.startReduceId, batchId.endReduceId) + case _ => + throw new IllegalArgumentException("unexpected shuffle block id format: " + blockId) + } + + val name = ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID).name + val indexFile = new Path(fallbackPath, s"$appId/$shuffleId/$name") + val start = startReduceId * 8L + val end = endReduceId * 8L + Utils.tryWithResource(fallbackFileSystem.open(indexFile)) { inputStream => + Utils.tryWithResource(new DataInputStream(inputStream)) { index => + index.skip(start) + val offset = index.readLong() + index.skip(end - (start + 8L)) + val nextOffset = index.readLong() + val name = ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID).name + val dataFile = new Path(fallbackPath, s"$appId/$shuffleId/$name") + val f = fallbackFileSystem.open(dataFile) + val size = nextOffset - 1 - offset + logDebug(s"To byte array $size") + val array = new Array[Byte](size.toInt) + val startTimeNs = System.nanoTime() + f.seek(offset) + f.read(array) + logDebug(s"Took ${(System.nanoTime() - startTimeNs) / (1000 * 1000)}ms") + f.close() + new NioManagedBuffer(ByteBuffer.wrap(array)) + } + } + } +} + diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index e3b3fc5cc4565..fa4e46590aa5e 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -295,8 +295,9 @@ final class ShuffleBlockFetcherIterator( var hostLocalBlockBytes = 0L var remoteBlockBytes = 0L + val fallback = FallbackStorage.FALLBACK_BLOCK_MANAGER_ID.executorId for ((address, blockInfos) <- blocksByAddress) { - if (address.executorId == blockManager.blockManagerId.executorId) { + if (Seq(blockManager.blockManagerId.executorId, fallback).contains(address.executorId)) { checkBlockSizes(blockInfos) val mergedBlockInfos = mergeContinuousShuffleBlockIdsIfNeeded( blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)), doBatchFetch) diff --git a/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala b/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala new file mode 100644 index 0000000000000..2eeae2ecad5eb --- /dev/null +++ b/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.storage + +import java.io.{DataOutputStream, FileOutputStream, IOException} +import java.nio.file.Files + +import scala.concurrent.duration._ + +import org.mockito.{ArgumentMatchers => mc} +import org.mockito.Mockito.{mock, times, verify, when} +import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite, TestUtils} +import org.apache.spark.LocalSparkContext.withSpark +import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher.{EXECUTOR_MEMORY, SPARK_MASTER} +import org.apache.spark.network.BlockTransferService +import org.apache.spark.network.buffer.ManagedBuffer +import org.apache.spark.scheduler.ExecutorDecommissionInfo +import org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend +import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleBlockInfo} +import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID +import org.apache.spark.util.Utils.tryWithResource + +class FallbackStorageSuite extends SparkFunSuite with LocalSparkContext { + + def getSparkConf(initialExecutor: Int = 1, minExecutor: Int = 1): SparkConf = { + new SparkConf(false) + .setAppName(getClass.getName) + .set(SPARK_MASTER, s"local-cluster[$initialExecutor,1,1024]") + .set(EXECUTOR_MEMORY, "1g") + .set(UI.UI_ENABLED, false) + .set(DYN_ALLOCATION_ENABLED, true) + .set(DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED, true) + .set(DYN_ALLOCATION_INITIAL_EXECUTORS, initialExecutor) + .set(DYN_ALLOCATION_MIN_EXECUTORS, minExecutor) + .set(DECOMMISSION_ENABLED, true) + .set(STORAGE_DECOMMISSION_ENABLED, true) + .set(STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED, true) + .set(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH, + Files.createTempDirectory("tmp").toFile.getAbsolutePath + "/") + } + + test("fallback storage APIs - copy/exists") { + val conf = new SparkConf(false) + .set("spark.app.id", "testId") + .set(STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED, true) + .set(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH, + Files.createTempDirectory("tmp").toFile.getAbsolutePath + "/") + val fallbackStorage = new FallbackStorage(conf) + val bmm = new BlockManagerMaster(new NoopRpcEndpointRef(conf), null, conf, false) + + val bm = mock(classOf[BlockManager]) + val dbm = new DiskBlockManager(conf, false) + when(bm.diskBlockManager).thenReturn(dbm) + when(bm.master).thenReturn(bmm) + val resolver = new IndexShuffleBlockResolver(conf, bm) + when(bm.migratableResolver).thenReturn(resolver) + + resolver.getIndexFile(1, 1L).createNewFile() + resolver.getDataFile(1, 1L).createNewFile() + + val indexFile = resolver.getIndexFile(1, 2L) + tryWithResource(new FileOutputStream(indexFile)) { fos => + tryWithResource(new DataOutputStream(fos)) { dos => + dos.writeLong(0) + dos.writeLong(4) + } + } + + val dataFile = resolver.getDataFile(1, 2L) + tryWithResource(new FileOutputStream(dataFile)) { fos => + tryWithResource(new DataOutputStream(fos)) { dos => + dos.writeLong(0) + } + } + + fallbackStorage.copy(ShuffleBlockInfo(1, 1L), bm) + 
fallbackStorage.copy(ShuffleBlockInfo(1, 2L), bm) + + assert(fallbackStorage.exists(1, ShuffleIndexBlockId(1, 1L, NOOP_REDUCE_ID).name)) + assert(fallbackStorage.exists(1, ShuffleDataBlockId(1, 1L, NOOP_REDUCE_ID).name)) + assert(fallbackStorage.exists(1, ShuffleIndexBlockId(1, 2L, NOOP_REDUCE_ID).name)) + assert(fallbackStorage.exists(1, ShuffleDataBlockId(1, 2L, NOOP_REDUCE_ID).name)) + + // The files for shuffle 1 and map 1 are empty intentionally. + intercept[java.io.EOFException] { + FallbackStorage.read(conf, ShuffleBlockId(1, 1L, 0)) + } + FallbackStorage.read(conf, ShuffleBlockId(1, 2L, 0)) + } + + test("migrate shuffle data to fallback storage") { + val conf = new SparkConf(false) + .set("spark.app.id", "testId") + .set(STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED, true) + .set(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH, + Files.createTempDirectory("tmp").toFile.getAbsolutePath + "/") + + val ids = Set((1, 1L, 1)) + val bm = mock(classOf[BlockManager]) + val dbm = new DiskBlockManager(conf, false) + when(bm.diskBlockManager).thenReturn(dbm) + val indexShuffleBlockResolver = new IndexShuffleBlockResolver(conf, bm) + val indexFile = indexShuffleBlockResolver.getIndexFile(1, 1L) + val dataFile = indexShuffleBlockResolver.getDataFile(1, 1L) + indexFile.createNewFile() + dataFile.createNewFile() + + val resolver = mock(classOf[IndexShuffleBlockResolver]) + when(resolver.getStoredShuffles()) + .thenReturn(ids.map(triple => ShuffleBlockInfo(triple._1, triple._2)).toSeq) + ids.foreach { case (shuffleId: Int, mapId: Long, reduceId: Int) => + when(resolver.getMigrationBlocks(mc.any())) + .thenReturn(List( + (ShuffleIndexBlockId(shuffleId, mapId, reduceId), mock(classOf[ManagedBuffer])), + (ShuffleDataBlockId(shuffleId, mapId, reduceId), mock(classOf[ManagedBuffer])))) + when(resolver.getIndexFile(shuffleId, mapId)).thenReturn(indexFile) + when(resolver.getDataFile(shuffleId, mapId)).thenReturn(dataFile) + } + + when(bm.getPeers(mc.any())) + .thenReturn(Seq(FallbackStorage.FALLBACK_BLOCK_MANAGER_ID)) + val bmm = new BlockManagerMaster(new NoopRpcEndpointRef(conf), null, conf, false) + when(bm.master).thenReturn(bmm) + val blockTransferService = mock(classOf[BlockTransferService]) + when(blockTransferService.uploadBlockSync(mc.any(), mc.any(), mc.any(), mc.any(), mc.any(), + mc.any(), mc.any())).thenThrow(new IOException) + when(bm.blockTransferService).thenReturn(blockTransferService) + when(bm.migratableResolver).thenReturn(resolver) + when(bm.getMigratableRDDBlocks()).thenReturn(Seq()) + + val decommissioner = new BlockManagerDecommissioner(conf, bm) + + try { + decommissioner.start() + val fallbackStorage = new FallbackStorage(conf) + eventually(timeout(10.second), interval(1.seconds)) { + // uploadBlockSync is not used + verify(blockTransferService, times(1)) + .uploadBlockSync(mc.any(), mc.any(), mc.any(), mc.any(), mc.any(), mc.any(), mc.any()) + + Seq("shuffle_1_1_0.index", "shuffle_1_1_0.data").foreach { filename => + assert(fallbackStorage.exists(shuffleId = 1, filename)) + } + } + } finally { + decommissioner.stop() + } + } + + test("Upload from all decommissioned executors") { + sc = new SparkContext(getSparkConf(2, 2)) + withSpark(sc) { sc => + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) + val rdd1 = sc.parallelize(1 to 10, 10) + val rdd2 = rdd1.map(x => (x % 2, 1)) + val rdd3 = rdd2.reduceByKey(_ + _) + assert(rdd3.count() === 2) + + // Decommission all + val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend] + sc.getExecutorIds().foreach { + 
sched.decommissionExecutor(_, ExecutorDecommissionInfo(""), false) + } + + val files = Seq("shuffle_0_0_0.index", "shuffle_0_0_0.data") + val fallbackStorage = new FallbackStorage(sc.getConf) + // Uploading is not started yet. + files.foreach { file => assert(!fallbackStorage.exists(0, file)) } + + // Uploading is completed on decommissioned executors + eventually(timeout(20.seconds), interval(1.seconds)) { + files.foreach { file => assert(fallbackStorage.exists(0, file)) } + } + + // All executors are still alive. + assert(sc.getExecutorIds().size == 2) + } + } + + test("Upload multi stages") { + sc = new SparkContext(getSparkConf()) + withSpark(sc) { sc => + TestUtils.waitUntilExecutorsUp(sc, 1, 60000) + val rdd1 = sc.parallelize(1 to 10, 2) + val rdd2 = rdd1.map(x => (x % 2, 1)) + val rdd3 = rdd2.reduceByKey(_ + _) + val rdd4 = rdd3.sortByKey() + assert(rdd4.count() === 2) + + val shuffle0_files = Seq( + "shuffle_0_0_0.index", "shuffle_0_0_0.data", + "shuffle_0_1_0.index", "shuffle_0_1_0.data") + val shuffle1_files = Seq( + "shuffle_1_4_0.index", "shuffle_1_4_0.data", + "shuffle_1_5_0.index", "shuffle_1_5_0.data") + val fallbackStorage = new FallbackStorage(sc.getConf) + shuffle0_files.foreach { file => assert(!fallbackStorage.exists(0, file)) } + shuffle1_files.foreach { file => assert(!fallbackStorage.exists(1, file)) } + + // Decommission all + val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend] + sc.getExecutorIds().foreach { + sched.decommissionExecutor(_, ExecutorDecommissionInfo(""), false) + } + + eventually(timeout(10.seconds), interval(1.seconds)) { + shuffle0_files.foreach { file => assert(fallbackStorage.exists(0, file)) } + shuffle1_files.foreach { file => assert(fallbackStorage.exists(1, file)) } + } + } + } + + test("Newly added executors should access old data from remote storage") { + sc = new SparkContext(getSparkConf(2, 0)) + withSpark(sc) { sc => + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) + val rdd1 = sc.parallelize(1 to 10, 2) + val rdd2 = rdd1.map(x => (x % 2, 1)) + val rdd3 = rdd2.reduceByKey(_ + _) + assert(rdd3.collect() === Array((0, 5), (1, 5))) + + // Decommission all + val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend] + sc.getExecutorIds().foreach { + sched.decommissionExecutor(_, ExecutorDecommissionInfo(""), false) + } + + // Make it sure that fallback storage are ready + val fallbackStorage = new FallbackStorage(sc.getConf) + eventually(timeout(10.seconds), interval(1.seconds)) { + Seq( + "shuffle_0_0_0.index", "shuffle_0_0_0.data", + "shuffle_0_1_0.index", "shuffle_0_1_0.data").foreach { file => + assert(fallbackStorage.exists(0, file)) + } + } + + // Since the data is safe, force to shrink down to zero executor + sc.getExecutorIds().foreach { id => + sched.killExecutor(id) + } + eventually(timeout(20.seconds), interval(1.seconds)) { + assert(sc.getExecutorIds().isEmpty) + } + + // Dynamic allocation will start new executors + assert(rdd3.collect() === Array((0, 5), (1, 5))) + assert(rdd3.sortByKey().count() == 2) + assert(sc.getExecutorIds().nonEmpty) + } + } +} From f5d2165c95fe83f24be9841807613950c1d5d6d0 Mon Sep 17 00:00:00 2001 From: "Jungtaek Lim (HeartSaVioR)" Date: Tue, 1 Dec 2020 06:44:15 +0900 Subject: [PATCH 054/150] [SPARK-33440][CORE] Use current timestamp with warning log in HadoopFSDelegationTokenProvider when the issue date for token is not set up properly ### What changes were proposed in this pull request? 
This PR proposes to use the current timestamp, with a warning log, when the issue date for a token is not set up properly. The next section explains the rationale in detail.

### Why are the changes needed?

Unfortunately not every implementation respects the `issue date` in `AbstractDelegationTokenIdentifier`, which Spark relies on in its calculation. The default value of the issue date is 0L, which is far from the actual issue date, breaking the logic for calculating the next renewal date under some circumstances and leading to a 0 interval (immediate) when rescheduling token renewal.

In HadoopFSDelegationTokenProvider, Spark calculates the token renewal interval as below:

https://github.com/apache/spark/blob/2c64b731ae6a976b0d75a95901db849b4a0e2393/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala#L123-L134

The interval is calculated as `token.renew() - identifier.getIssueDate`, which gives the correct interval assuming both `token.renew()` and `identifier.getIssueDate` produce correct values, but it goes wrong when `identifier.getIssueDate` provides 0L (the default value), like below:

```
20/10/13 06:34:19 INFO security.HadoopFSDelegationTokenProvider: Renewal interval is 1603175657000 for token S3ADelegationToken/IDBroker
20/10/13 06:34:19 INFO security.HadoopFSDelegationTokenProvider: Renewal interval is 86400048 for token HDFS_DELEGATION_TOKEN
```

We pick the minimum value as a safety guard (so in this case, `86400048` is picked up), but that safety guard has an unintended bad impact in this case.

https://github.com/apache/spark/blob/2c64b731ae6a976b0d75a95901db849b4a0e2393/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala#L58-L71

Spark takes the interval calculated above, the "minimum" of the intervals, blindly adds it to each token's issue date to calculate the next renewal date for the token, and picks the "minimum" value again. In the problematic case, the value would be `86400048` (86400048 + 0), which is far smaller than the current timestamp.

https://github.com/apache/spark/blob/2c64b731ae6a976b0d75a95901db849b4a0e2393/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala#L228-L234

The current timestamp is then subtracted from the next renewal date to get the interval, which is multiplied by the configured ratio to produce the final schedule interval. In the problematic case, this value becomes negative.

https://github.com/apache/spark/blob/2c64b731ae6a976b0d75a95901db849b4a0e2393/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala#L180-L188

There is a safety guard that disallows negative values, but that simply yields 0, meaning "schedule immediately". This triggers the next calculation of the next renewal date, which leads to the same behavior, hence updating the delegation token immediately and continuously.

Since we fetch the token just before the calculation happens, the actual issue date is likely only slightly earlier, so it is not that dangerous to use the current timestamp as the issue date for a token whose issue date has not been set up properly. Still, it is better not to leave the token implementation as it is, so we log a warning message asking end users to consult the token implementer.

### Does this PR introduce _any_ user-facing change?

Yes. End users won't encounter the tight loop of token renewal scheduling after this PR. From end users' perspective, there is nothing they need to change.

### How was this patch tested?
Manually tested with problematic environment. Closes #30366 from HeartSaVioR/SPARK-33440. Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../HadoopDelegationTokenManager.scala | 4 ++- .../HadoopFSDelegationTokenProvider.scala | 27 ++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala index 3168c763df4df..6ce195b6c7a34 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala @@ -178,7 +178,7 @@ private[spark] class HadoopDelegationTokenManager( private def scheduleRenewal(delay: Long): Unit = { val _delay = math.max(0, delay) - logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(delay)}.") + logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(_delay)}.") val renewalTask = new Runnable() { override def run(): Unit = { @@ -230,6 +230,8 @@ private[spark] class HadoopDelegationTokenManager( val now = System.currentTimeMillis val ratio = sparkConf.get(CREDENTIALS_RENEWAL_INTERVAL_RATIO) val delay = (ratio * (nextRenewal - now)).toLong + logInfo(s"Calculated delay on renewal is $delay, based on next renewal $nextRenewal " + + s"and the ratio $ratio, and current time $now") scheduleRenewal(delay) creds } diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala index a46864e2d3c9c..0dc6aa1d7ef30 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProvider.scala @@ -63,7 +63,8 @@ private[deploy] class HadoopFSDelegationTokenProvider val identifier = token .decodeIdentifier() .asInstanceOf[AbstractDelegationTokenIdentifier] - identifier.getIssueDate + interval + val tokenKind = token.getKind.toString + getIssueDate(tokenKind, identifier) + interval } if (nextRenewalDates.isEmpty) None else Some(nextRenewalDates.min) } @@ -126,13 +127,33 @@ private[deploy] class HadoopFSDelegationTokenProvider Try { val newExpiration = token.renew(hadoopConf) val identifier = token.decodeIdentifier().asInstanceOf[AbstractDelegationTokenIdentifier] - val interval = newExpiration - identifier.getIssueDate - logInfo(s"Renewal interval is $interval for token ${token.getKind.toString}") + val tokenKind = token.getKind.toString + val interval = newExpiration - getIssueDate(tokenKind, identifier) + logInfo(s"Renewal interval is $interval for token $tokenKind") interval }.toOption } if (renewIntervals.isEmpty) None else Some(renewIntervals.min) } + + private def getIssueDate(kind: String, identifier: AbstractDelegationTokenIdentifier): Long = { + val now = System.currentTimeMillis() + val issueDate = identifier.getIssueDate + if (issueDate > now) { + logWarning(s"Token $kind has set up issue date later than current time. (provided: " + + s"$issueDate / current timestamp: $now) Please make sure clocks are in sync between " + + "machines. 
If the issue is not a clock mismatch, consult token implementor to check " + + "whether issue date is valid.") + issueDate + } else if (issueDate > 0L) { + issueDate + } else { + logWarning(s"Token $kind has not set up issue date properly. (provided: $issueDate) " + + s"Using current timestamp ($now) as issue date instead. Consult token implementor to fix " + + "the behavior.") + now + } + } } private[deploy] object HadoopFSDelegationTokenProvider { From 596fbc1d292259c8850f026e2d7267056abee3bc Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 1 Dec 2020 09:52:19 +0900 Subject: [PATCH 055/150] [SPARK-33556][ML] Add array_to_vector function for dataframe column ### What changes were proposed in this pull request? Add array_to_vector function for dataframe column ### Why are the changes needed? Utility function for array to vector conversion. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? scala unit test & doctest. Closes #30498 from WeichenXu123/array_to_vec. Lead-authored-by: Weichen Xu Co-authored-by: Hyukjin Kwon Signed-off-by: HyukjinKwon --- .../scala/org/apache/spark/ml/functions.scala | 16 ++++++++- .../org/apache/spark/ml/FunctionsSuite.scala | 18 ++++++++-- python/docs/source/reference/pyspark.ml.rst | 1 + python/pyspark/ml/functions.py | 34 +++++++++++++++++++ python/pyspark/ml/functions.pyi | 2 ++ 5 files changed, 68 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/functions.scala b/mllib/src/main/scala/org/apache/spark/ml/functions.scala index a0b6d11a46be9..43622a4f3edfb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/functions.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/functions.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml import org.apache.spark.annotation.Since -import org.apache.spark.ml.linalg.{SparseVector, Vector} +import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.mllib.linalg.{Vector => OldVector} import org.apache.spark.sql.Column import org.apache.spark.sql.functions.udf @@ -72,6 +72,20 @@ object functions { } } + private val arrayToVectorUdf = udf { array: Seq[Double] => + Vectors.dense(array.toArray) + } + + /** + * Converts a column of array of numeric type into a column of dense vectors in MLlib. + * @param v: the column of array<NumericType> type + * @return a column of type `org.apache.spark.ml.linalg.Vector` + * @since 3.1.0 + */ + def array_to_vector(v: Column): Column = { + arrayToVectorUdf(v) + } + private[ml] def checkNonNegativeWeight = udf { value: Double => require(value >= 0, s"illegal weight value: $value. weight must be >= 0.0.") diff --git a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala index 3dd9a7d8ec85d..21b823383d233 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala @@ -18,8 +18,8 @@ package org.apache.spark.ml import org.apache.spark.SparkException -import org.apache.spark.ml.functions.vector_to_array -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.functions.{array_to_vector, vector_to_array} +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.MLTest import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.functions.col @@ -87,4 +87,18 @@ class FunctionsSuite extends MLTest { assert(thrown2.getMessage.contains( s"Unsupported dtype: float16. 
Valid values: float64, float32.")) } + + test("test array_to_vector") { + val df1 = Seq(Tuple1(Array(0.5, 1.5))).toDF("c1") + val resultVec = df1.select(array_to_vector(col("c1"))).collect()(0)(0).asInstanceOf[Vector] + assert(resultVec === Vectors.dense(Array(0.5, 1.5))) + + val df2 = Seq(Tuple1(Array(1.5f, 2.5f))).toDF("c1") + val resultVec2 = df2.select(array_to_vector(col("c1"))).collect()(0)(0).asInstanceOf[Vector] + assert(resultVec2 === Vectors.dense(Array(1.5, 2.5))) + + val df3 = Seq(Tuple1(Array(1, 2))).toDF("c1") + val resultVec3 = df3.select(array_to_vector(col("c1"))).collect()(0)(0).asInstanceOf[Vector] + assert(resultVec3 === Vectors.dense(Array(1.0, 2.0))) + } } diff --git a/python/docs/source/reference/pyspark.ml.rst b/python/docs/source/reference/pyspark.ml.rst index 5fafe5899f20b..2de0ff65a3ae8 100644 --- a/python/docs/source/reference/pyspark.ml.rst +++ b/python/docs/source/reference/pyspark.ml.rst @@ -196,6 +196,7 @@ ML Functions .. autosummary:: :toctree: api/ + array_to_vector vector_to_array diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py index cf4a014d897fb..fb245a3d05827 100644 --- a/python/pyspark/ml/functions.py +++ b/python/pyspark/ml/functions.py @@ -69,6 +69,40 @@ def vector_to_array(col, dtype="float64"): sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col), dtype)) +def array_to_vector(col): + """ + Converts a column of array of numeric type into a column of dense vectors in MLlib + + .. versionadded:: 3.1.0 + + Parameters + ---------- + col : :py:class:`pyspark.sql.Column` or str + Input column + + Returns + ------- + :py:class:`pyspark.sql.Column` + The converted column of MLlib dense vectors. + + Examples + -------- + >>> from pyspark.ml.functions import array_to_vector + >>> df1 = spark.createDataFrame([([1.5, 2.5],),], schema='v1 array') + >>> df1.select(array_to_vector('v1').alias('vec1')).collect() + [Row(vec1=DenseVector([1.5, 2.5]))] + >>> df2 = spark.createDataFrame([([1.5, 3.5],),], schema='v1 array') + >>> df2.select(array_to_vector('v1').alias('vec1')).collect() + [Row(vec1=DenseVector([1.5, 3.5]))] + >>> df3 = spark.createDataFrame([([1, 3],),], schema='v1 array') + >>> df3.select(array_to_vector('v1').alias('vec1')).collect() + [Row(vec1=DenseVector([1.0, 3.0]))] + """ + sc = SparkContext._active_spark_context + return Column( + sc._jvm.org.apache.spark.ml.functions.array_to_vector(_to_java_column(col))) + + def _test(): import doctest from pyspark.sql import SparkSession diff --git a/python/pyspark/ml/functions.pyi b/python/pyspark/ml/functions.pyi index 42650e742e781..12b44fc63b5b7 100644 --- a/python/pyspark/ml/functions.pyi +++ b/python/pyspark/ml/functions.pyi @@ -20,3 +20,5 @@ from pyspark import SparkContext as SparkContext, since as since # noqa: F401 from pyspark.sql.column import Column as Column def vector_to_array(col: Column) -> Column: ... + +def array_to_vector(col: Column) -> Column: ... From aeb3649fb9103a7541ef54f451c60fcd5a091934 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 1 Dec 2020 10:34:40 +0900 Subject: [PATCH 056/150] [SPARK-33613][PYTHON][TESTS] Replace deprecated APIs in pyspark tests ### What changes were proposed in this pull request? This replaces deprecated API usage in PySpark tests with the preferred APIs. These have been deprecated for some time and usage is not consistent within tests. - https://docs.python.org/3/library/unittest.html#deprecated-aliases ### Why are the changes needed? For consistency and eventual removal of deprecated APIs. 
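As a small illustration (not part of the patch), the preferred spellings next to the deprecated aliases being replaced:

```python
import unittest


class DeprecatedAliasExample(unittest.TestCase):
    def test_preferred_assertion_names(self):
        # Preferred APIs; the old aliases assertEquals and assertRaisesRegexp
        # still work but emit DeprecationWarning on current Python 3.
        self.assertEqual(1 + 1, 2)
        with self.assertRaisesRegex(ValueError, "invalid literal"):
            int("not a number")


if __name__ == "__main__":
    unittest.main()
```

The same renames (`assertEquals` -> `assertEqual`, `assertRaisesRegexp` -> `assertRaisesRegex`, plus `pandas.util.testing` -> `pandas.testing`) are what the diffs below apply across the pyspark test modules.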
### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests Closes #30557 from BryanCutler/replace-deprecated-apis-in-tests. Authored-by: Bryan Cutler Signed-off-by: HyukjinKwon --- python/pyspark/ml/tests/test_feature.py | 2 +- python/pyspark/ml/tests/test_image.py | 6 +- python/pyspark/ml/tests/test_param.py | 2 +- python/pyspark/ml/tests/test_persistence.py | 2 +- python/pyspark/ml/tests/test_tuning.py | 4 +- python/pyspark/ml/tests/test_wrapper.py | 6 +- python/pyspark/sql/tests/test_arrow.py | 28 ++--- python/pyspark/sql/tests/test_catalog.py | 56 ++++----- python/pyspark/sql/tests/test_column.py | 10 +- python/pyspark/sql/tests/test_conf.py | 2 +- python/pyspark/sql/tests/test_dataframe.py | 78 ++++++------- python/pyspark/sql/tests/test_datasources.py | 10 +- python/pyspark/sql/tests/test_functions.py | 22 ++-- .../sql/tests/test_pandas_cogrouped_map.py | 14 +-- .../sql/tests/test_pandas_grouped_map.py | 32 +++--- python/pyspark/sql/tests/test_pandas_map.py | 8 +- python/pyspark/sql/tests/test_pandas_udf.py | 32 +++--- .../sql/tests/test_pandas_udf_grouped_agg.py | 16 +-- .../sql/tests/test_pandas_udf_scalar.py | 108 +++++++++--------- .../sql/tests/test_pandas_udf_typehints.py | 2 +- .../sql/tests/test_pandas_udf_window.py | 6 +- python/pyspark/sql/tests/test_types.py | 24 ++-- python/pyspark/sql/tests/test_udf.py | 28 ++--- python/pyspark/sql/tests/test_utils.py | 15 ++- python/pyspark/tests/test_profiler.py | 4 +- python/pyspark/tests/test_rdd.py | 30 ++--- python/pyspark/tests/test_worker.py | 2 +- 27 files changed, 274 insertions(+), 275 deletions(-) diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 244110a986138..98b8ce6dfb95c 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -169,7 +169,7 @@ def test_count_vectorizer_from_vocab(self): # Test an empty vocabulary with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "vocabSize.*invalid.*0"): + with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"): CountVectorizerModel.from_vocabulary([], inputCol="words") # Test model with default settings can transform diff --git a/python/pyspark/ml/tests/test_image.py b/python/pyspark/ml/tests/test_image.py index 1001598779d48..00e4c95a84355 100644 --- a/python/pyspark/ml/tests/test_image.py +++ b/python/pyspark/ml/tests/test_image.py @@ -47,19 +47,19 @@ def test_read_images(self): self.assertEqual(ImageSchema.undefinedImageType, "Undefined") with QuietTest(self.sc): - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "image argument should be pyspark.sql.types.Row; however", lambda: ImageSchema.toNDArray("a")) with QuietTest(self.sc): - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "image argument should have attributes specified in", lambda: ImageSchema.toNDArray(Row(a=1))) with QuietTest(self.sc): - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "array argument should be numpy.ndarray; however, it got", lambda: ImageSchema.toImage("a")) diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py index 4cddf50f36bdf..09fe21e9fdeca 100644 --- a/python/pyspark/ml/tests/test_param.py +++ b/python/pyspark/ml/tests/test_param.py @@ -308,7 +308,7 @@ def test_logistic_regression_check_thresholds(self): LogisticRegression ) - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "Logistic Regression getThreshold found inconsistent.*$", 
LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] diff --git a/python/pyspark/ml/tests/test_persistence.py b/python/pyspark/ml/tests/test_persistence.py index 826e6cd351d32..0bbcfcdf50e95 100644 --- a/python/pyspark/ml/tests/test_persistence.py +++ b/python/pyspark/ml/tests/test_persistence.py @@ -442,7 +442,7 @@ def test_default_read_write_default_params(self): del metadata['defaultParamMap'] metadataStr = json.dumps(metadata, separators=[',', ':']) loadedMetadata = reader._parseMetaData(metadataStr, ) - with self.assertRaisesRegexp(AssertionError, "`defaultParamMap` section not found"): + with self.assertRaisesRegex(AssertionError, "`defaultParamMap` section not found"): reader.getAndSetParams(lr, loadedMetadata) # Prior to 2.4.0, metadata doesn't have `defaultParamMap`. diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index 729e46419ae2c..ced32c07f245f 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -499,7 +499,7 @@ def test_invalid_user_specified_folds(self): evaluator=evaluator, numFolds=2, foldCol="fold") - with self.assertRaisesRegexp(Exception, "Fold number must be in range"): + with self.assertRaisesRegex(Exception, "Fold number must be in range"): cv.fit(dataset_with_folds) cv = CrossValidator(estimator=lr, @@ -507,7 +507,7 @@ def test_invalid_user_specified_folds(self): evaluator=evaluator, numFolds=4, foldCol="fold") - with self.assertRaisesRegexp(Exception, "The validation data at fold 3 is empty"): + with self.assertRaisesRegex(Exception, "The validation data at fold 3 is empty"): cv.fit(dataset_with_folds) diff --git a/python/pyspark/ml/tests/test_wrapper.py b/python/pyspark/ml/tests/test_wrapper.py index 31475299c7b98..8ed6a6bad95ed 100644 --- a/python/pyspark/ml/tests/test_wrapper.py +++ b/python/pyspark/ml/tests/test_wrapper.py @@ -54,7 +54,7 @@ def test_java_object_gets_detached(self): model.__del__() def condition(): - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): + with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): model._java_obj.toString() self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString()) return True @@ -67,9 +67,9 @@ def condition(): pass def condition(): - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): + with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): model._java_obj.toString() - with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object): + with self.assertRaisesRegex(py4j.protocol.Py4JError, error_no_object): summary._java_obj.toString() return True diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index e764c42d88a31..bf80c62ea0542 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -34,7 +34,7 @@ if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal if have_pyarrow: import pyarrow as pa # noqa: F401 @@ -137,7 +137,7 @@ def test_toPandas_fallback_disabled(self): df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): with self.warnings_lock: - with self.assertRaisesRegexp(Exception, 'Unsupported type'): + with self.assertRaisesRegex(Exception, 'Unsupported type'): df.toPandas() def test_null_conversion(self): @@ -214,7 +214,7 @@ def raise_exception(): exception_udf = udf(raise_exception, IntegerType()) df = df.withColumn("error", 
exception_udf()) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'My error'): + with self.assertRaisesRegex(Exception, 'My error'): df.toPandas() def _createDataFrame_toggle(self, pdf, schema=None): @@ -228,7 +228,7 @@ def _createDataFrame_toggle(self, pdf, schema=None): def test_createDataFrame_toggle(self): pdf = self.create_pandas_data_frame() df_no_arrow, df_arrow = self._createDataFrame_toggle(pdf, schema=self.schema) - self.assertEquals(df_no_arrow.collect(), df_arrow.collect()) + self.assertEqual(df_no_arrow.collect(), df_arrow.collect()) def test_createDataFrame_respect_session_timezone(self): from datetime import timedelta @@ -258,7 +258,7 @@ def test_createDataFrame_respect_session_timezone(self): def test_createDataFrame_with_schema(self): pdf = self.create_pandas_data_frame() df = self.spark.createDataFrame(pdf, schema=self.schema) - self.assertEquals(self.schema, df.schema) + self.assertEqual(self.schema, df.schema) pdf_arrow = df.toPandas() assert_frame_equal(pdf_arrow, pdf) @@ -269,7 +269,7 @@ def test_createDataFrame_with_incorrect_schema(self): wrong_schema = StructType(fields) with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}): with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"): + with self.assertRaisesRegex(Exception, "[D|d]ecimal.*got.*date"): self.spark.createDataFrame(pdf, schema=wrong_schema) def test_createDataFrame_with_names(self): @@ -277,23 +277,23 @@ def test_createDataFrame_with_names(self): new_names = list(map(str, range(len(self.schema.fieldNames())))) # Test that schema as a list of column names gets applied df = self.spark.createDataFrame(pdf, schema=list(new_names)) - self.assertEquals(df.schema.fieldNames(), new_names) + self.assertEqual(df.schema.fieldNames(), new_names) # Test that schema as tuple of column names gets applied df = self.spark.createDataFrame(pdf, schema=tuple(new_names)) - self.assertEquals(df.schema.fieldNames(), new_names) + self.assertEqual(df.schema.fieldNames(), new_names) def test_createDataFrame_column_name_encoding(self): pdf = pd.DataFrame({u'a': [1]}) columns = self.spark.createDataFrame(pdf).columns self.assertTrue(isinstance(columns[0], str)) - self.assertEquals(columns[0], 'a') + self.assertEqual(columns[0], 'a') columns = self.spark.createDataFrame(pdf, [u'b']).columns self.assertTrue(isinstance(columns[0], str)) - self.assertEquals(columns[0], 'b') + self.assertEqual(columns[0], 'b') def test_createDataFrame_with_single_data_type(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, ".*IntegerType.*not supported.*"): + with self.assertRaisesRegex(ValueError, ".*IntegerType.*not supported.*"): self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int") def test_createDataFrame_does_not_modify_input(self): @@ -311,7 +311,7 @@ def test_schema_conversion_roundtrip(self): from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema arrow_schema = to_arrow_schema(self.schema) schema_rt = from_arrow_schema(arrow_schema) - self.assertEquals(self.schema, schema_rt) + self.assertEqual(self.schema, schema_rt) def test_createDataFrame_with_array_type(self): pdf = pd.DataFrame({"a": [[1, 2], [3, 4]], "b": [[u"x", u"y"], [u"y", u"z"]]}) @@ -420,7 +420,7 @@ def test_createDataFrame_fallback_enabled(self): def test_createDataFrame_fallback_disabled(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(TypeError, 'Unsupported type'): + with self.assertRaisesRegex(TypeError, 'Unsupported type'): 
self.spark.createDataFrame( pd.DataFrame({"a": [[datetime.datetime(2015, 11, 1, 0, 30)]]}), "a: array") @@ -545,7 +545,7 @@ def tearDownClass(cls): cls.spark.stop() def test_exception_by_max_results(self): - with self.assertRaisesRegexp(Exception, "is bigger than"): + with self.assertRaisesRegex(Exception, "is bigger than"): self.spark.range(0, 10000, 1, 100).toPandas() diff --git a/python/pyspark/sql/tests/test_catalog.py b/python/pyspark/sql/tests/test_catalog.py index ca4e427a7db28..56e7c97020662 100644 --- a/python/pyspark/sql/tests/test_catalog.py +++ b/python/pyspark/sql/tests/test_catalog.py @@ -25,11 +25,11 @@ class CatalogTests(ReusedSQLTestCase): def test_current_database(self): spark = self.spark with self.database("some_db"): - self.assertEquals(spark.catalog.currentDatabase(), "default") + self.assertEqual(spark.catalog.currentDatabase(), "default") spark.sql("CREATE DATABASE some_db") spark.catalog.setCurrentDatabase("some_db") - self.assertEquals(spark.catalog.currentDatabase(), "some_db") - self.assertRaisesRegexp( + self.assertEqual(spark.catalog.currentDatabase(), "some_db") + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.setCurrentDatabase("does_not_exist")) @@ -38,10 +38,10 @@ def test_list_databases(self): spark = self.spark with self.database("some_db"): databases = [db.name for db in spark.catalog.listDatabases()] - self.assertEquals(databases, ["default"]) + self.assertEqual(databases, ["default"]) spark.sql("CREATE DATABASE some_db") databases = [db.name for db in spark.catalog.listDatabases()] - self.assertEquals(sorted(databases), ["default", "some_db"]) + self.assertEqual(sorted(databases), ["default", "some_db"]) def test_list_tables(self): from pyspark.sql.catalog import Table @@ -50,8 +50,8 @@ def test_list_tables(self): spark.sql("CREATE DATABASE some_db") with self.table("tab1", "some_db.tab2", "tab3_via_catalog"): with self.tempView("temp_tab"): - self.assertEquals(spark.catalog.listTables(), []) - self.assertEquals(spark.catalog.listTables("some_db"), []) + self.assertEqual(spark.catalog.listTables(), []) + self.assertEqual(spark.catalog.listTables("some_db"), []) spark.createDataFrame([(1, 1)]).createOrReplaceTempView("temp_tab") spark.sql("CREATE TABLE tab1 (name STRING, age INT) USING parquet") spark.sql("CREATE TABLE some_db.tab2 (name STRING, age INT) USING parquet") @@ -66,40 +66,40 @@ def test_list_tables(self): sorted(spark.catalog.listTables("default"), key=lambda t: t.name) tablesSomeDb = \ sorted(spark.catalog.listTables("some_db"), key=lambda t: t.name) - self.assertEquals(tables, tablesDefault) - self.assertEquals(len(tables), 3) - self.assertEquals(len(tablesSomeDb), 2) - self.assertEquals(tables[0], Table( + self.assertEqual(tables, tablesDefault) + self.assertEqual(len(tables), 3) + self.assertEqual(len(tablesSomeDb), 2) + self.assertEqual(tables[0], Table( name="tab1", database="default", description=None, tableType="MANAGED", isTemporary=False)) - self.assertEquals(tables[1], Table( + self.assertEqual(tables[1], Table( name="tab3_via_catalog", database="default", description=description, tableType="MANAGED", isTemporary=False)) - self.assertEquals(tables[2], Table( + self.assertEqual(tables[2], Table( name="temp_tab", database=None, description=None, tableType="TEMPORARY", isTemporary=True)) - self.assertEquals(tablesSomeDb[0], Table( + self.assertEqual(tablesSomeDb[0], Table( name="tab2", database="some_db", description=None, tableType="MANAGED", isTemporary=False)) - 
self.assertEquals(tablesSomeDb[1], Table( + self.assertEqual(tablesSomeDb[1], Table( name="temp_tab", database=None, description=None, tableType="TEMPORARY", isTemporary=True)) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.listTables("does_not_exist")) @@ -119,12 +119,12 @@ def test_list_functions(self): self.assertTrue("to_timestamp" in functions) self.assertTrue("to_unix_timestamp" in functions) self.assertTrue("current_database" in functions) - self.assertEquals(functions["+"], Function( + self.assertEqual(functions["+"], Function( name="+", description=None, className="org.apache.spark.sql.catalyst.expressions.Add", isTemporary=True)) - self.assertEquals(functions, functionsDefault) + self.assertEqual(functions, functionsDefault) with self.function("func1", "some_db.func2"): spark.catalog.registerFunction("temp_func", lambda x: str(x)) @@ -141,7 +141,7 @@ def test_list_functions(self): self.assertTrue("temp_func" in newFunctionsSomeDb) self.assertTrue("func1" not in newFunctionsSomeDb) self.assertTrue("func2" in newFunctionsSomeDb) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.listFunctions("does_not_exist")) @@ -158,16 +158,16 @@ def test_list_columns(self): columns = sorted(spark.catalog.listColumns("tab1"), key=lambda c: c.name) columnsDefault = \ sorted(spark.catalog.listColumns("tab1", "default"), key=lambda c: c.name) - self.assertEquals(columns, columnsDefault) - self.assertEquals(len(columns), 2) - self.assertEquals(columns[0], Column( + self.assertEqual(columns, columnsDefault) + self.assertEqual(len(columns), 2) + self.assertEqual(columns[0], Column( name="age", description=None, dataType="int", nullable=True, isPartition=False, isBucket=False)) - self.assertEquals(columns[1], Column( + self.assertEqual(columns[1], Column( name="name", description=None, dataType="string", @@ -176,26 +176,26 @@ def test_list_columns(self): isBucket=False)) columns2 = \ sorted(spark.catalog.listColumns("tab2", "some_db"), key=lambda c: c.name) - self.assertEquals(len(columns2), 2) - self.assertEquals(columns2[0], Column( + self.assertEqual(len(columns2), 2) + self.assertEqual(columns2[0], Column( name="nickname", description=None, dataType="string", nullable=True, isPartition=False, isBucket=False)) - self.assertEquals(columns2[1], Column( + self.assertEqual(columns2[1], Column( name="tolerance", description=None, dataType="float", nullable=True, isPartition=False, isBucket=False)) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "tab2", lambda: spark.catalog.listColumns("tab2")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.listColumns("does_not_exist")) diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py index 4a9c7106a12b0..2ae0a9bedd67d 100644 --- a/python/pyspark/sql/tests/test_column.py +++ b/python/pyspark/sql/tests/test_column.py @@ -47,7 +47,7 @@ def test_validate_column_types(self): self.assertTrue("Column" in _to_java_column(u"a").getClass().toString()) self.assertTrue("Column" in _to_java_column(self.spark.range(1).id).getClass().toString()) - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "Invalid argument, not a string or column", lambda: _to_java_column(1)) @@ -58,7 +58,7 @@ class A(): self.assertRaises(TypeError, lambda: _to_java_column(A())) self.assertRaises(TypeError, lambda: _to_java_column([])) - 
self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "Invalid argument, not a string or column", lambda: udf(lambda x: x)(None)) @@ -79,9 +79,9 @@ def test_column_operators(self): cs.startswith('a'), cs.endswith('a'), ci.eqNullSafe(cs) self.assertTrue(all(isinstance(c, Column) for c in css)) self.assertTrue(isinstance(ci.cast(LongType()), Column)) - self.assertRaisesRegexp(ValueError, - "Cannot apply 'in' operator against a column", - lambda: 1 in cs) + self.assertRaisesRegex(ValueError, + "Cannot apply 'in' operator against a column", + lambda: 1 in cs) def test_column_accessor(self): from pyspark.sql.functions import col diff --git a/python/pyspark/sql/tests/test_conf.py b/python/pyspark/sql/tests/test_conf.py index 1cc0c1b7562c5..9222e2b8272d6 100644 --- a/python/pyspark/sql/tests/test_conf.py +++ b/python/pyspark/sql/tests/test_conf.py @@ -28,7 +28,7 @@ def test_conf(self): self.assertEqual(spark.conf.get("bogo"), "ta") self.assertEqual(spark.conf.get("bogo", "not.read"), "ta") self.assertEqual(spark.conf.get("not.set", "ta"), "ta") - self.assertRaisesRegexp(Exception, "not.set", lambda: spark.conf.get("not.set")) + self.assertRaisesRegex(Exception, "not.set", lambda: spark.conf.get("not.set")) spark.conf.unset("bogo") self.assertEqual(spark.conf.get("bogo", "colombia"), "colombia") diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index d941707b8969f..e3977e8185180 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -343,7 +343,7 @@ def test_replace(self): self.spark.createDataFrame( [(u'Alice', 10, 80.1)], schema).replace({u"Alice": u"Bob", 10: 20}).first() - with self.assertRaisesRegexp( + with self.assertRaisesRegex( TypeError, 'value argument is required when to_replace is not a dictionary.'): self.spark.createDataFrame( @@ -390,7 +390,7 @@ def test_extended_hint_types(self): self.assertEqual(3, logical_plan.toString().count("itworks")) def test_sample(self): - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "should be a bool, float and number", lambda: self.spark.range(1).sample()) @@ -426,12 +426,12 @@ def test_toDF_with_schema_string(self): self.assertEqual(df.collect(), data) # number of fields must match. - self.assertRaisesRegexp(Exception, "Length of object", - lambda: rdd.toDF("key: int").collect()) + self.assertRaisesRegex(Exception, "Length of object", + lambda: rdd.toDF("key: int").collect()) # field types mismatch will cause exception at runtime. - self.assertRaisesRegexp(Exception, "FloatType can not accept", - lambda: rdd.toDF("key: float, value: string").collect()) + self.assertRaisesRegex(Exception, "FloatType can not accept", + lambda: rdd.toDF("key: float, value: string").collect()) # flat schema values will be wrapped into row. 
df = rdd.map(lambda row: row.key).toDF("int") @@ -491,15 +491,15 @@ def test_cache(self): spark.catalog.clearCache() self.assertFalse(spark.catalog.isCached("tab1")) self.assertFalse(spark.catalog.isCached("tab2")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.isCached("does_not_exist")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.cacheTable("does_not_exist")) - self.assertRaisesRegexp( + self.assertRaisesRegex( AnalysisException, "does_not_exist", lambda: spark.catalog.uncacheTable("does_not_exist")) @@ -523,12 +523,12 @@ def test_to_pandas(self): import numpy as np pdf = self._to_pandas() types = pdf.dtypes - self.assertEquals(types[0], np.int32) - self.assertEquals(types[1], np.object) - self.assertEquals(types[2], np.bool) - self.assertEquals(types[3], np.float32) - self.assertEquals(types[4], np.object) # datetime.date - self.assertEquals(types[5], 'datetime64[ns]') + self.assertEqual(types[0], np.int32) + self.assertEqual(types[1], np.object) + self.assertEqual(types[2], np.bool) + self.assertEqual(types[3], np.float32) + self.assertEqual(types[4], np.object) # datetime.date + self.assertEqual(types[5], 'datetime64[ns]') @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_to_pandas_with_duplicated_column_names(self): @@ -540,8 +540,8 @@ def test_to_pandas_with_duplicated_column_names(self): df = self.spark.sql(sql) pdf = df.toPandas() types = pdf.dtypes - self.assertEquals(types.iloc[0], np.int32) - self.assertEquals(types.iloc[1], np.int32) + self.assertEqual(types.iloc[0], np.int32) + self.assertEqual(types.iloc[1], np.int32) @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_to_pandas_on_cross_join(self): @@ -560,13 +560,13 @@ def test_to_pandas_on_cross_join(self): df = self.spark.sql(sql) pdf = df.toPandas() types = pdf.dtypes - self.assertEquals(types.iloc[0], np.int32) - self.assertEquals(types.iloc[1], np.int32) + self.assertEqual(types.iloc[0], np.int32) + self.assertEqual(types.iloc[1], np.int32) @unittest.skipIf(have_pandas, "Required Pandas was found.") def test_to_pandas_required_pandas_not_found(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): + with self.assertRaisesRegex(ImportError, 'Pandas >= .* must be installed'): self._to_pandas() @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore @@ -577,9 +577,9 @@ def test_to_pandas_avoid_astype(self): data = [(1, "foo", 16777220), (None, "bar", None)] df = self.spark.createDataFrame(data, schema) types = df.toPandas().dtypes - self.assertEquals(types[0], np.float64) # doesn't convert to np.int32 due to NaN value. - self.assertEquals(types[1], np.object) - self.assertEquals(types[2], np.float64) + self.assertEqual(types[0], np.float64) # doesn't convert to np.int32 due to NaN value. 
+ self.assertEqual(types[1], np.object) + self.assertEqual(types[2], np.float64) @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_to_pandas_from_empty_dataframe(self): @@ -675,7 +675,7 @@ def test_create_dataframe_from_pandas_with_timestamp(self): @unittest.skipIf(have_pandas, "Required Pandas was found.") def test_create_dataframe_required_pandas_not_found(self): with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ImportError, "(Pandas >= .* must be installed|No module named '?pandas'?)"): import pandas as pd @@ -688,7 +688,7 @@ def test_create_dataframe_required_pandas_not_found(self): @unittest.skipIf(not have_pandas, pandas_requirement_message) # type: ignore def test_create_dataframe_from_pandas_with_dst(self): import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal from datetime import datetime pdf = pd.DataFrame({'time': [datetime(2015, 10, 31, 22, 30)]}) @@ -724,7 +724,7 @@ def test_repr_behaviors(self): ||22222|22222| |+-----+-----+ |""" - self.assertEquals(re.sub(pattern, '', expected1), df.__repr__()) + self.assertEqual(re.sub(pattern, '', expected1), df.__repr__()) with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): expected2 = """+---+-----+ ||key|value| @@ -733,7 +733,7 @@ def test_repr_behaviors(self): ||222| 222| |+---+-----+ |""" - self.assertEquals(re.sub(pattern, '', expected2), df.__repr__()) + self.assertEqual(re.sub(pattern, '', expected2), df.__repr__()) with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): expected3 = """+---+-----+ ||key|value| @@ -742,7 +742,7 @@ def test_repr_behaviors(self): |+---+-----+ |only showing top 1 row |""" - self.assertEquals(re.sub(pattern, '', expected3), df.__repr__()) + self.assertEqual(re.sub(pattern, '', expected3), df.__repr__()) # test when eager evaluation is enabled and _repr_html_ will be called with self.sql_conf({"spark.sql.repl.eagerEval.enabled": True}): @@ -752,7 +752,7 @@ def test_repr_behaviors(self): | |
      |<tr><td>22222</td><td>22222</td></tr>
      |""" - self.assertEquals(re.sub(pattern, '', expected1), df._repr_html_()) + self.assertEqual(re.sub(pattern, '', expected1), df._repr_html_()) with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): expected2 = """ | @@ -760,7 +760,7 @@ def test_repr_behaviors(self): | |
      |<tr><th>key</th><th>value</th></tr>
      |<tr><td>222</td><td>222</td></tr>
      |""" - self.assertEquals(re.sub(pattern, '', expected2), df._repr_html_()) + self.assertEqual(re.sub(pattern, '', expected2), df._repr_html_()) with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): expected3 = """ | @@ -768,19 +768,19 @@ def test_repr_behaviors(self): |
      |<tr><th>key</th><th>value</th></tr>
      |only showing top 1 row |""" - self.assertEquals(re.sub(pattern, '', expected3), df._repr_html_()) + self.assertEqual(re.sub(pattern, '', expected3), df._repr_html_()) # test when eager evaluation is disabled and _repr_html_ will be called with self.sql_conf({"spark.sql.repl.eagerEval.enabled": False}): expected = "DataFrame[key: bigint, value: string]" - self.assertEquals(None, df._repr_html_()) - self.assertEquals(expected, df.__repr__()) + self.assertEqual(None, df._repr_html_()) + self.assertEqual(expected, df.__repr__()) with self.sql_conf({"spark.sql.repl.eagerEval.truncate": 3}): - self.assertEquals(None, df._repr_html_()) - self.assertEquals(expected, df.__repr__()) + self.assertEqual(None, df._repr_html_()) + self.assertEqual(expected, df.__repr__()) with self.sql_conf({"spark.sql.repl.eagerEval.maxNumRows": 1}): - self.assertEquals(None, df._repr_html_()) - self.assertEquals(expected, df.__repr__()) + self.assertEqual(None, df._repr_html_()) + self.assertEqual(expected, df.__repr__()) def test_to_local_iterator(self): df = self.spark.range(8, numPartitions=4) @@ -818,7 +818,7 @@ def test_to_local_iterator_not_fully_consumed(self): def test_same_semantics_error(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, "should be of DataFrame.*int"): + with self.assertRaisesRegex(ValueError, "should be of DataFrame.*int"): self.spark.range(10).sameSemantics(1) def test_input_files(self): @@ -830,7 +830,7 @@ def test_input_files(self): input_files_list = self.spark.read.parquet(tpath).inputFiles() # input files list should contain 10 entries - self.assertEquals(len(input_files_list), 10) + self.assertEqual(len(input_files_list), 10) # all file paths in list must contain tpath for file_path in input_files_list: self.assertTrue(tpath in file_path) diff --git a/python/pyspark/sql/tests/test_datasources.py b/python/pyspark/sql/tests/test_datasources.py index 9425494fb0d90..26a6c58dbad6b 100644 --- a/python/pyspark/sql/tests/test_datasources.py +++ b/python/pyspark/sql/tests/test_datasources.py @@ -107,7 +107,7 @@ def test_read_text_file_list(self): df = self.spark.read.text(['python/test_support/sql/text-test.txt', 'python/test_support/sql/text-test.txt']) count = df.count() - self.assertEquals(count, 4) + self.assertEqual(count, 4) def test_json_sampling_ratio(self): rdd = self.spark.sparkContext.range(0, 100, 1, 1) \ @@ -115,14 +115,14 @@ def test_json_sampling_ratio(self): schema = self.spark.read.option('inferSchema', True) \ .option('samplingRatio', 0.5) \ .json(rdd).schema - self.assertEquals(schema, StructType([StructField("a", LongType(), True)])) + self.assertEqual(schema, StructType([StructField("a", LongType(), True)])) def test_csv_sampling_ratio(self): rdd = self.spark.sparkContext.range(0, 100, 1, 1) \ .map(lambda x: '0.1' if x == 1 else str(x)) schema = self.spark.read.option('inferSchema', True)\ .csv(rdd, samplingRatio=0.5).schema - self.assertEquals(schema, StructType([StructField("_c0", IntegerType(), True)])) + self.assertEqual(schema, StructType([StructField("_c0", IntegerType(), True)])) def test_checking_csv_header(self): path = tempfile.mkdtemp() @@ -135,7 +135,7 @@ def test_checking_csv_header(self): StructField('f1', IntegerType(), nullable=True)]) df = self.spark.read.option('header', 'true').schema(schema)\ .csv(path, enforceSchema=False) - self.assertRaisesRegexp( + self.assertRaisesRegex( Exception, "CSV header does not conform to the schema", lambda: df.collect()) @@ -154,7 +154,7 @@ def test_ignore_column_of_all_nulls(self): 
StructField('b', LongType(), nullable=True), StructField('c', StringType(), nullable=True)]) readback = self.spark.read.json(path, dropFieldIfAllNull=True) - self.assertEquals(readback.schema, schema) + self.assertEqual(readback.schema, schema) finally: shutil.rmtree(path) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 2858bdeca0d5a..58599a9fa42f5 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -185,7 +185,7 @@ def test_string_functions(self): ] df = self.spark.createDataFrame([['nick']], schema=['name']) - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "must be the same type", lambda: df.select(col('name').substr(0, lit(1)))) @@ -321,16 +321,16 @@ def test_sort_with_nulls_order(self): df = self.spark.createDataFrame( [('Tom', 80), (None, 60), ('Alice', 50)], ["name", "height"]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.asc_nulls_first('name')).collect(), [Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.asc_nulls_last('name')).collect(), [Row(name=u'Alice'), Row(name=u'Tom'), Row(name=None)]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.desc_nulls_first('name')).collect(), [Row(name=None), Row(name=u'Tom'), Row(name=u'Alice')]) - self.assertEquals( + self.assertEqual( df.select(df.name).orderBy(functions.desc_nulls_last('name')).collect(), [Row(name=u'Tom'), Row(name=u'Alice'), Row(name=None)]) @@ -354,7 +354,7 @@ def test_slice(self): df = self.spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) - self.assertEquals( + self.assertEqual( df.select(slice(df.x, 2, 2).alias("sliced")).collect(), df.select(slice(df.x, lit(2), lit(2)).alias("sliced")).collect(), ) @@ -364,7 +364,7 @@ def test_array_repeat(self): df = self.spark.range(1) - self.assertEquals( + self.assertEqual( df.select(array_repeat("id", 3)).toDF("val").collect(), df.select(array_repeat("id", lit(3))).toDF("val").collect(), ) @@ -580,14 +580,14 @@ def test_datetime_functions(self): from datetime import date df = self.spark.range(1).selectExpr("'2017-01-22' as dateCol") parse_result = df.select(functions.to_date(functions.col("dateCol"))).first() - self.assertEquals(date(2017, 1, 22), parse_result['to_date(dateCol)']) + self.assertEqual(date(2017, 1, 22), parse_result['to_date(dateCol)']) def test_assert_true(self): from pyspark.sql.functions import assert_true df = self.spark.range(3) - self.assertEquals( + self.assertEqual( df.select(assert_true(df.id < 3)).toDF("val").collect(), [Row(val=None), Row(val=None), Row(val=None)], ) @@ -604,7 +604,7 @@ def test_assert_true(self): with self.assertRaises(TypeError) as cm: df.select(assert_true(df.id < 2, 5)) - self.assertEquals( + self.assertEqual( "errMsg should be a Column or a str, got ", str(cm.exception) ) @@ -626,7 +626,7 @@ def test_raise_error(self): with self.assertRaises(TypeError) as cm: df.select(raise_error(None)) - self.assertEquals( + self.assertEqual( "errMsg should be a Column or a str, got ", str(cm.exception) ) diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py index 4afc1dfcc1c6e..3c016e04adf2e 100644 --- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py @@ -25,7 +25,7 @@ if have_pandas: import pandas as pd - from pandas.util.testing import 
assert_frame_equal + from pandas.testing import assert_frame_equal if have_pyarrow: import pyarrow as pa # noqa: F401 @@ -135,8 +135,8 @@ def test_mixed_scalar_udfs_followed_by_cogrouby_apply(self): .applyInPandas(lambda x, y: pd.DataFrame([(x.sum().sum(), y.sum().sum())]), 'sum1 int, sum2 int').collect() - self.assertEquals(result[0]['sum1'], 165) - self.assertEquals(result[0]['sum2'], 165) + self.assertEqual(result[0]['sum1'], 165) + self.assertEqual(result[0]['sum2'], 165) def test_with_key_left(self): self._test_with_key(self.data1, self.data1, isLeft=True) @@ -174,7 +174,7 @@ def test_wrong_return_type(self): left = self.data1 right = self.data2 with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*ArrayType.*TimestampType'): left.groupby('id').cogroup(right.groupby('id')).applyInPandas( @@ -183,7 +183,7 @@ def test_wrong_return_type(self): def test_wrong_args(self): left = self.data1 right = self.data2 - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): left.groupby('id').cogroup(right.groupby('id')) \ .applyInPandas(lambda: 1, StructType([StructField("d", DoubleType())])) @@ -194,14 +194,14 @@ def test_case_insensitive_grouping_column(self): row = df1.groupby("ColUmn").cogroup( df1.groupby("COLUMN") ).applyInPandas(lambda r, l: r + l, "column long, value long").first() - self.assertEquals(row.asDict(), Row(column=2, value=2).asDict()) + self.assertEqual(row.asDict(), Row(column=2, value=2).asDict()) df2 = self.spark.createDataFrame([(1, 1)], ("column", "value")) row = df1.groupby("ColUmn").cogroup( df2.groupby("COLUMN") ).applyInPandas(lambda r, l: r + l, "column long, value long").first() - self.assertEquals(row.asDict(), Row(column=2, value=2).asDict()) + self.assertEqual(row.asDict(), Row(column=2, value=2).asDict()) @staticmethod def _test_with_key(left, right, isLeft): diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index a639a8d51f55c..64803a6574675 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -33,7 +33,7 @@ if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal if have_pyarrow: import pyarrow as pa # noqa: F401 @@ -160,7 +160,7 @@ def test_array_type_correct(self): def test_register_grouped_map_udf(self): foo_udf = pandas_udf(lambda x: x, "id long", PandasUDFType.GROUPED_MAP) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( ValueError, 'f.*SQL_BATCHED_UDF.*SQL_SCALAR_PANDAS_UDF.*SQL_GROUPED_AGG_PANDAS_UDF.*'): self.spark.catalog.registerFunction("foo_udf", foo_udf) @@ -244,7 +244,7 @@ def test_datatype_string(self): def test_wrong_return_type(self): with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*grouped map Pandas UDF.*ArrayType.*TimestampType'): pandas_udf( @@ -256,20 +256,20 @@ def test_wrong_args(self): df = self.data with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(lambda x: x) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(udf(lambda x: x, DoubleType())) - with 
self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(sum(df.v)) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(df.v + 1) - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): df.groupby('id').apply( pandas_udf(lambda: 1, StructType([StructField("d", DoubleType())]))) - with self.assertRaisesRegexp(ValueError, 'Invalid udf'): + with self.assertRaisesRegex(ValueError, 'Invalid udf'): df.groupby('id').apply(pandas_udf(lambda x, y: x, DoubleType())) - with self.assertRaisesRegexp(ValueError, 'Invalid udf.*GROUPED_MAP'): + with self.assertRaisesRegex(ValueError, 'Invalid udf.*GROUPED_MAP'): df.groupby('id').apply( pandas_udf(lambda x, y: x, DoubleType(), PandasUDFType.SCALAR)) @@ -284,7 +284,7 @@ def test_unsupported_types(self): for unsupported_type in unsupported_types: schema = StructType([StructField('id', LongType(), True), unsupported_type]) with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, common_err_msg): + with self.assertRaisesRegex(NotImplementedError, common_err_msg): pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP) # Regression test for SPARK-23314 @@ -451,9 +451,9 @@ def invalid_positional_types(pdf): with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": False}): with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "KeyError: 'id'"): + with self.assertRaisesRegex(Exception, "KeyError: 'id'"): grouped_df.apply(column_name_typo).collect() - with self.assertRaisesRegexp(Exception, "[D|d]ecimal.*got.*date"): + with self.assertRaisesRegex(Exception, "[D|d]ecimal.*got.*date"): grouped_df.apply(invalid_positional_types).collect() def test_positional_assignment_conf(self): @@ -482,7 +482,7 @@ def dummy_pandas_udf(df): # this was throwing an AnalysisException before SPARK-24208 res = df_with_pandas.alias('temp0').join(df_with_pandas.alias('temp1'), col('temp0.key') == col('temp1.key')) - self.assertEquals(res.count(), 5) + self.assertEqual(res.count(), 5) def test_mixed_scalar_udfs_followed_by_groupby_apply(self): df = self.spark.range(0, 10).toDF('v1') @@ -494,7 +494,7 @@ def test_mixed_scalar_udfs_followed_by_groupby_apply(self): 'sum int', PandasUDFType.GROUPED_MAP)) - self.assertEquals(result.collect()[0]['sum'], 165) + self.assertEqual(result.collect()[0]['sum'], 165) def test_grouped_with_empty_partition(self): data = [Row(id=1, x=2), Row(id=1, x=3), Row(id=2, x=4)] @@ -604,7 +604,7 @@ def my_pandas_udf(pdf): df = self.spark.createDataFrame([[1, 1]], ["column", "score"]) row = df.groupby('COLUMN').applyInPandas( my_pandas_udf, schema="column integer, score float").first() - self.assertEquals(row.asDict(), Row(column=1, score=0.5).asDict()) + self.assertEqual(row.asDict(), Row(column=1, score=0.5).asDict()) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_pandas_map.py b/python/pyspark/sql/tests/test_pandas_map.py index 3ca437f75fc23..d53face702201 100644 --- a/python/pyspark/sql/tests/test_pandas_map.py +++ b/python/pyspark/sql/tests/test_pandas_map.py @@ -61,7 +61,7 @@ def func(iterator): df = self.spark.range(10) actual = df.mapInPandas(func, 'id long').collect() expected = df.collect() - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) def test_multiple_columns(self): data = [(1, "foo"), (2, None), (3, "bar"), (4, "bar")] 
@@ -75,7 +75,7 @@ def func(iterator): actual = df.mapInPandas(func, df.schema).collect() expected = df.collect() - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) def test_different_output_length(self): def func(iterator): @@ -84,7 +84,7 @@ def func(iterator): df = self.spark.range(10) actual = df.repartition(1).mapInPandas(func, 'a long').collect() - self.assertEquals(set((r.a for r in actual)), set(range(100))) + self.assertEqual(set((r.a for r in actual)), set(range(100))) def test_empty_iterator(self): def empty_iter(_): @@ -110,7 +110,7 @@ def func(iterator): df = self.spark.range(10) actual = df.mapInPandas(func, 'id long').mapInPandas(func, 'id long').collect() expected = df.collect() - self.assertEquals(actual, expected) + self.assertEqual(actual, expected) if __name__ == "__main__": diff --git a/python/pyspark/sql/tests/test_pandas_udf.py b/python/pyspark/sql/tests/test_pandas_udf.py index cc742fc4267cb..975eb4680dd04 100644 --- a/python/pyspark/sql/tests/test_pandas_udf.py +++ b/python/pyspark/sql/tests/test_pandas_udf.py @@ -114,31 +114,31 @@ def test_udf_wrong_arg(self): @pandas_udf('blah') def foo(x): return x - with self.assertRaisesRegexp(ValueError, 'Invalid return type.*None'): + with self.assertRaisesRegex(ValueError, 'Invalid return type.*None'): @pandas_udf(functionType=PandasUDFType.SCALAR) def foo(x): return x - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): @pandas_udf('double', 100) def foo(x): return x - with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'): + with self.assertRaisesRegex(ValueError, '0-arg pandas_udfs.*not.*supported'): pandas_udf(lambda: 1, LongType(), PandasUDFType.SCALAR) - with self.assertRaisesRegexp(ValueError, '0-arg pandas_udfs.*not.*supported'): + with self.assertRaisesRegex(ValueError, '0-arg pandas_udfs.*not.*supported'): @pandas_udf(LongType(), PandasUDFType.SCALAR) def zero_with_type(): return 1 - with self.assertRaisesRegexp(TypeError, 'Invalid return type'): + with self.assertRaisesRegex(TypeError, 'Invalid return type'): @pandas_udf(returnType=PandasUDFType.GROUPED_MAP) def foo(df): return df - with self.assertRaisesRegexp(TypeError, 'Invalid return type'): + with self.assertRaisesRegex(TypeError, 'Invalid return type'): @pandas_udf(returnType='double', functionType=PandasUDFType.GROUPED_MAP) def foo(df): return df - with self.assertRaisesRegexp(ValueError, 'Invalid function'): + with self.assertRaisesRegex(ValueError, 'Invalid function'): @pandas_udf(returnType='k int, v double', functionType=PandasUDFType.GROUPED_MAP) def foo(k, v, w): return k @@ -154,14 +154,14 @@ def foofoo(x, y): df = self.spark.range(0, 100) # plain udf (test for SPARK-23754) - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.withColumn('v', udf(foo)('id')).collect ) # pandas scalar udf - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.withColumn( @@ -170,7 +170,7 @@ def foofoo(x, y): ) # pandas grouped map - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.groupBy('id').apply( @@ -178,7 +178,7 @@ def foofoo(x, y): ).collect ) - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.groupBy('id').apply( @@ -187,7 +187,7 @@ def foofoo(x, y): ) # pandas grouped agg - self.assertRaisesRegexp( + self.assertRaisesRegex( PythonException, exc_message, df.groupBy('id').agg( @@ -210,8 +210,8 @@ def udf(column): 
# Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast. with self.sql_conf({ "spark.sql.execution.pandas.convertToArrowArraySafely": True}): - with self.assertRaisesRegexp(Exception, - "Exception thrown when converting pandas.Series"): + with self.assertRaisesRegex(Exception, + "Exception thrown when converting pandas.Series"): df.select(['A']).withColumn('udf', udf('A')).collect() # Disabling Arrow safe type check. @@ -231,8 +231,8 @@ def udf(column): # When enabling safe type check, Arrow 0.11.0+ disallows overflow cast. with self.sql_conf({ "spark.sql.execution.pandas.convertToArrowArraySafely": True}): - with self.assertRaisesRegexp(Exception, - "Exception thrown when converting pandas.Series"): + with self.assertRaisesRegex(Exception, + "Exception thrown when converting pandas.Series"): df.withColumn('udf', udf('id')).collect() # Disabling safe type check, let Arrow do the cast anyway. diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py index 2cbcf31f6e7b3..b49092ed70d04 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py @@ -30,7 +30,7 @@ if have_pandas: import pandas as pd - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal @unittest.skipIf( @@ -145,20 +145,20 @@ def test_basic(self): def test_unsupported_types(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): + with self.assertRaisesRegex(NotImplementedError, 'not supported'): pandas_udf( lambda x: x, ArrayType(ArrayType(TimestampType())), PandasUDFType.GROUPED_AGG) with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): + with self.assertRaisesRegex(NotImplementedError, 'not supported'): @pandas_udf('mean double, std double', PandasUDFType.GROUPED_AGG) def mean_and_std_udf(v): return v.mean(), v.std() with QuietTest(self.sc): - with self.assertRaisesRegexp(NotImplementedError, 'not supported'): + with self.assertRaisesRegex(NotImplementedError, 'not supported'): @pandas_udf(ArrayType(TimestampType()), PandasUDFType.GROUPED_AGG) def mean_and_std_udf(v): return {v.mean(): v.std()} @@ -428,7 +428,7 @@ def test_array_type(self): array_udf = pandas_udf(lambda x: [1.0, 2.0], 'array', PandasUDFType.GROUPED_AGG) result1 = df.groupby('id').agg(array_udf(df['v']).alias('v2')) - self.assertEquals(result1.first()['v2'], [1.0, 2.0]) + self.assertEqual(result1.first()['v2'], [1.0, 2.0]) def test_invalid_args(self): df = self.data @@ -436,19 +436,19 @@ def test_invalid_args(self): mean_udf = self.pandas_agg_mean_udf with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, 'nor.*aggregate function'): df.groupby(df.id).agg(plus_one(df.v)).collect() with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, 'aggregate function.*argument.*aggregate function'): df.groupby(df.id).agg(mean_udf(mean_udf(df.v))).collect() with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, 'mixture.*aggregate function.*group aggregate pandas UDF'): df.groupby(df.id).agg(mean_udf(df.v), mean(df.v)).collect() diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py index 5da5d043ceca4..2eb2dec00106e 100644 --- 
a/python/pyspark/sql/tests/test_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py @@ -133,7 +133,7 @@ def test_vectorized_udf_basic(self): long_f(col('long')), float_f(col('float')), double_f(col('double')), decimal_f('decimal'), bool_f(col('bool')), array_long_f('array_long')) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_register_nondeterministic_vectorized_udf_basic(self): random_pandas_udf = pandas_udf( @@ -169,7 +169,7 @@ def test_vectorized_udf_null_boolean(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: bool_f = pandas_udf(lambda x: x, BooleanType(), udf_type) res = df.select(bool_f(col('bool'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_byte(self): data = [(None,), (2,), (3,), (4,)] @@ -178,7 +178,7 @@ def test_vectorized_udf_null_byte(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: byte_f = pandas_udf(lambda x: x, ByteType(), udf_type) res = df.select(byte_f(col('byte'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_short(self): data = [(None,), (2,), (3,), (4,)] @@ -187,7 +187,7 @@ def test_vectorized_udf_null_short(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: short_f = pandas_udf(lambda x: x, ShortType(), udf_type) res = df.select(short_f(col('short'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_int(self): data = [(None,), (2,), (3,), (4,)] @@ -196,7 +196,7 @@ def test_vectorized_udf_null_int(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: int_f = pandas_udf(lambda x: x, IntegerType(), udf_type) res = df.select(int_f(col('int'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_long(self): data = [(None,), (2,), (3,), (4,)] @@ -205,7 +205,7 @@ def test_vectorized_udf_null_long(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: long_f = pandas_udf(lambda x: x, LongType(), udf_type) res = df.select(long_f(col('long'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_float(self): data = [(3.0,), (5.0,), (-1.0,), (None,)] @@ -214,7 +214,7 @@ def test_vectorized_udf_null_float(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: float_f = pandas_udf(lambda x: x, FloatType(), udf_type) res = df.select(float_f(col('float'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_double(self): data = [(3.0,), (5.0,), (-1.0,), (None,)] @@ -223,7 +223,7 @@ def test_vectorized_udf_null_double(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: double_f = pandas_udf(lambda x: x, DoubleType(), udf_type) res = df.select(double_f(col('double'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_decimal(self): data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)] @@ -232,7 +232,7 @@ def test_vectorized_udf_null_decimal(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18), udf_type) res = 
df.select(decimal_f(col('decimal'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_string(self): data = [("foo",), (None,), ("bar",), ("bar",)] @@ -241,7 +241,7 @@ def test_vectorized_udf_null_string(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: str_f = pandas_udf(lambda x: x, StringType(), udf_type) res = df.select(str_f(col('str'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_string_in_udf(self): df = self.spark.range(10) @@ -255,7 +255,7 @@ def iter_f(it): str_f = pandas_udf(f, StringType(), udf_type) actual = df.select(str_f(col('id'))) expected = df.select(col('id').cast('string')) - self.assertEquals(expected.collect(), actual.collect()) + self.assertEqual(expected.collect(), actual.collect()) def test_vectorized_udf_datatype_string(self): df = self.spark.range(10).select( @@ -279,7 +279,7 @@ def test_vectorized_udf_datatype_string(self): long_f(col('long')), float_f(col('float')), double_f(col('double')), decimal_f('decimal'), bool_f(col('bool'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_null_binary(self): data = [(bytearray(b"a"),), (None,), (bytearray(b"bb"),), (bytearray(b"ccc"),)] @@ -288,7 +288,7 @@ def test_vectorized_udf_null_binary(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: str_f = pandas_udf(lambda x: x, BinaryType(), udf_type) res = df.select(str_f(col('binary'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_array_type(self): data = [([1, 2],), ([3, 4],)] @@ -297,7 +297,7 @@ def test_vectorized_udf_array_type(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()), udf_type) result = df.select(array_f(col('array'))) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_vectorized_udf_null_array(self): data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)] @@ -306,7 +306,7 @@ def test_vectorized_udf_null_array(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: array_f = pandas_udf(lambda x: x, ArrayType(IntegerType()), udf_type) result = df.select(array_f(col('array'))) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_vectorized_udf_struct_type(self): df = self.spark.range(10) @@ -375,7 +375,7 @@ def test_vectorized_udf_nested_struct(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, 'Invalid return type with scalar Pandas UDFs'): pandas_udf(lambda x: x, returnType=nested_type, functionType=udf_type) @@ -392,7 +392,7 @@ def test_vectorized_udf_map_type(self): else: map_f = pandas_udf(lambda x: x, MapType(StringType(), LongType()), udf_type) result = df.select(map_f(col('map'))) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_vectorized_udf_complex(self): df = self.spark.range(10).select( @@ -422,7 +422,7 @@ def iter_mul(it): (iter_add, iter_power2, iter_mul)]: res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c'))) expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * 
c')) - self.assertEquals(expected.collect(), res.collect()) + self.assertEqual(expected.collect(), res.collect()) def test_vectorized_udf_exception(self): df = self.spark.range(10) @@ -435,14 +435,14 @@ def iter_raise_exception(it): for raise_exception in [scalar_raise_exception, iter_raise_exception]: with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'division( or modulo)? by zero'): + with self.assertRaisesRegex(Exception, 'division( or modulo)? by zero'): df.select(raise_exception(col('id'))).collect() def test_vectorized_udf_invalid_length(self): df = self.spark.range(10) raise_exception = pandas_udf(lambda _: pd.Series(1), LongType()) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, 'Result vector from pandas_udf was not the required length'): df.select(raise_exception(col('id'))).collect() @@ -453,7 +453,7 @@ def iter_udf_wong_output_size(it): yield pd.Series(1) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, "The length of output in Scalar iterator.*" "the length of output was 1"): @@ -469,7 +469,7 @@ def iter_udf_not_reading_all_input(it): with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}): df1 = self.spark.range(10).repartition(1) with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( Exception, "pandas iterator UDF should exhaust"): df1.select(iter_udf_not_reading_all_input(col('id'))).collect() @@ -486,7 +486,7 @@ def test_vectorized_udf_chained(self): for f, g in [(scalar_f, scalar_g), (iter_f, iter_g)]: res = df.select(g(f(col('id')))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_chained_struct_type(self): df = self.spark.range(10) @@ -517,7 +517,7 @@ def iter_f(it): def test_vectorized_udf_wrong_return_type(self): with QuietTest(self.sc): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.*TimestampType'): pandas_udf(lambda x: x, ArrayType(TimestampType()), udf_type) @@ -529,7 +529,7 @@ def test_vectorized_udf_return_scalar(self): PandasUDFType.SCALAR_ITER) for f in [scalar_f, iter_f]: with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Return.*type.*Series'): + with self.assertRaisesRegex(Exception, 'Return.*type.*Series'): df.select(f(col('id'))).collect() def test_vectorized_udf_decorator(self): @@ -545,14 +545,14 @@ def iter_identity(x): for identity in [scalar_identity, iter_identity]: res = df.select(identity(col('id'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_empty_partition(self): df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2)) for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: f = pandas_udf(lambda x: x, LongType(), udf_type) res = df.select(f(col('id'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_struct_with_empty_partition(self): df = self.spark.createDataFrame(self.sc.parallelize([Row(id=1)], 2))\ @@ -585,16 +585,16 @@ def iter_f(it): for f in [scalar_f, iter_f]: res = df.select(f(col('id'), col('id'))) - self.assertEquals(df.collect(), res.collect()) + self.assertEqual(df.collect(), res.collect()) def test_vectorized_udf_unsupported_types(self): with 
QuietTest(self.sc): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.*TimestampType'): pandas_udf(lambda x: x, ArrayType(TimestampType()), udf_type) - with self.assertRaisesRegexp( + with self.assertRaisesRegex( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.StructType'): pandas_udf(lambda x: x, @@ -637,10 +637,10 @@ def iter_check_data(it): result = df.withColumn("check_data", check_data(col("idx"), col("date"), col("date_copy"))).collect() - self.assertEquals(len(data), len(result)) + self.assertEqual(len(data), len(result)) for i in range(len(result)): - self.assertEquals(data[i][1], result[i][1]) # "date" col - self.assertEquals(data[i][1], result[i][2]) # "date_copy" col + self.assertEqual(data[i][1], result[i][1]) # "date" col + self.assertEqual(data[i][1], result[i][2]) # "date_copy" col self.assertIsNone(result[i][3]) # "check_data" col def test_vectorized_udf_timestamps(self): @@ -686,10 +686,10 @@ def iter_check_data(it): result = df.withColumn("check_data", check_data(col("idx"), col("timestamp"), col("timestamp_copy"))).collect() # Check that collection values are correct - self.assertEquals(len(data), len(result)) + self.assertEqual(len(data), len(result)) for i in range(len(result)): - self.assertEquals(data[i][1], result[i][1]) # "timestamp" col - self.assertEquals(data[i][1], result[i][2]) # "timestamp_copy" col + self.assertEqual(data[i][1], result[i][1]) # "timestamp" col + self.assertEqual(data[i][1], result[i][2]) # "timestamp_copy" col self.assertIsNone(result[i][3]) # "check_data" col def test_vectorized_udf_return_timestamp_tz(self): @@ -713,7 +713,7 @@ def iter_gen_timestamps(it): i, ts = r ts_tz = pd.Timestamp(i, unit='D', tz='America/Los_Angeles').to_pydatetime() expected = spark_ts_t.fromInternal(spark_ts_t.toInternal(ts_tz)) - self.assertEquals(expected, ts) + self.assertEqual(expected, ts) def test_vectorized_udf_check_config(self): with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 3}): @@ -799,9 +799,9 @@ def test_nondeterministic_vectorized_udf_in_aggregate(self): for random_udf in [self.nondeterministic_vectorized_udf, self.nondeterministic_vectorized_iter_udf]: with QuietTest(self.sc): - with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + with self.assertRaisesRegex(AnalysisException, 'nondeterministic'): df.groupby(df.id).agg(sum(random_udf(df.id))).collect() - with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + with self.assertRaisesRegex(AnalysisException, 'nondeterministic'): df.agg(sum(random_udf(df.id))).collect() def test_register_vectorized_udf_basic(self): @@ -825,8 +825,8 @@ def iter_original_add(it): res2 = self.spark.sql( "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t") expected = df.select(expr('a + b')) - self.assertEquals(expected.collect(), res1.collect()) - self.assertEquals(expected.collect(), res2.collect()) + self.assertEqual(expected.collect(), res1.collect()) + self.assertEqual(expected.collect(), res2.collect()) def test_scalar_iter_udf_init(self): import numpy as np @@ -854,7 +854,7 @@ def test_close(batch_iter): finally: raise RuntimeError("reached finally block") with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, "reached finally block"): + with self.assertRaisesRegex(Exception, "reached finally block"): self.spark.range(1).select(test_close(col("id"))).collect() 
def test_scalar_iter_udf_close_early(self): @@ -905,7 +905,7 @@ def test_timestamp_dst(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: foo_udf = pandas_udf(lambda x: x, 'timestamp', udf_type) result = df.withColumn('time', foo_udf(df.time)) - self.assertEquals(df.collect(), result.collect()) + self.assertEqual(df.collect(), result.collect()) def test_udf_category_type(self): @@ -1003,11 +1003,11 @@ def f4_iter(it): df_chained_4 = df.withColumn('f4_f2_f1', f4(f2(f1(df['v'])))) df_chained_5 = df.withColumn('f4_f3_f1', f4(f3(f1(df['v'])))) - self.assertEquals(expected_chained_1, df_chained_1.collect()) - self.assertEquals(expected_chained_2, df_chained_2.collect()) - self.assertEquals(expected_chained_3, df_chained_3.collect()) - self.assertEquals(expected_chained_4, df_chained_4.collect()) - self.assertEquals(expected_chained_5, df_chained_5.collect()) + self.assertEqual(expected_chained_1, df_chained_1.collect()) + self.assertEqual(expected_chained_2, df_chained_2.collect()) + self.assertEqual(expected_chained_3, df_chained_3.collect()) + self.assertEqual(expected_chained_4, df_chained_4.collect()) + self.assertEqual(expected_chained_5, df_chained_5.collect()) # Test multiple mixed UDF expressions in a single projection df_multi_1 = df \ @@ -1045,8 +1045,8 @@ def f4_iter(it): .withColumn('f4_f3_f2', f4(f3(f2(col('v'))))) \ .withColumn('f4_f3_f2_f1', f4(f3(f2(f1(col('v')))))) - self.assertEquals(expected_multi, df_multi_1.collect()) - self.assertEquals(expected_multi, df_multi_2.collect()) + self.assertEqual(expected_multi, df_multi_1.collect()) + self.assertEqual(expected_multi, df_multi_2.collect()) def test_mixed_udf_and_sql(self): df = self.spark.range(0, 1).toDF('v') @@ -1107,7 +1107,7 @@ def f3i(it): .withColumn('f3_f1_f2', f3(f1(f2(df['v'])))) \ .withColumn('f3_f2_f1', f3(f2(f1(df['v'])))) - self.assertEquals(expected, df1.collect()) + self.assertEqual(expected, df1.collect()) # SPARK-24721 @unittest.skipIf(not test_compiled, test_not_compiled_message) # type: ignore @@ -1138,17 +1138,17 @@ def test_datasource_with_udf(self): for df in [filesource_df, datasource_df, datasource_v2_df]: result = df.withColumn('c', c1) expected = df.withColumn('c', lit(2)) - self.assertEquals(expected.collect(), result.collect()) + self.assertEqual(expected.collect(), result.collect()) for df in [filesource_df, datasource_df, datasource_v2_df]: result = df.withColumn('c', c2) expected = df.withColumn('c', col('i') + 1) - self.assertEquals(expected.collect(), result.collect()) + self.assertEqual(expected.collect(), result.collect()) for df in [filesource_df, datasource_df, datasource_v2_df]: for f in [f1, f2]: result = df.filter(f) - self.assertEquals(0, result.count()) + self.assertEqual(0, result.count()) finally: shutil.rmtree(path) diff --git a/python/pyspark/sql/tests/test_pandas_udf_typehints.py b/python/pyspark/sql/tests/test_pandas_udf_typehints.py index d9717da4d2fbd..e30f43181ae96 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_typehints.py +++ b/python/pyspark/sql/tests/test_pandas_udf_typehints.py @@ -29,7 +29,7 @@ if have_pandas: import pandas as pd import numpy as np - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal @unittest.skipIf( diff --git a/python/pyspark/sql/tests/test_pandas_udf_window.py b/python/pyspark/sql/tests/test_pandas_udf_window.py index 5ad2ecd8f85d4..d861bcce9e8b8 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_window.py +++ b/python/pyspark/sql/tests/test_pandas_udf_window.py @@ -26,7 
+26,7 @@ from pyspark.testing.utils import QuietTest if have_pandas: - from pandas.util.testing import assert_frame_equal + from pandas.testing import assert_frame_equal @unittest.skipIf( @@ -241,14 +241,14 @@ def test_array_type(self): array_udf = pandas_udf(lambda x: [1.0, 2.0], 'array', PandasUDFType.GROUPED_AGG) result1 = df.withColumn('v2', array_udf(df['v']).over(w)) - self.assertEquals(result1.first()['v2'], [1.0, 2.0]) + self.assertEqual(result1.first()['v2'], [1.0, 2.0]) def test_invalid_args(self): df = self.data w = self.unbounded_window with QuietTest(self.sc): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, '.*not supported within a window function'): foo_udf = pandas_udf(lambda x: x, 'v double', PandasUDFType.GROUPED_MAP) diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py index 6b5c1ad6c4e46..eb4caf05d1af0 100644 --- a/python/pyspark/sql/tests/test_types.py +++ b/python/pyspark/sql/tests/test_types.py @@ -180,7 +180,7 @@ def test_infer_schema_not_enough_names(self): self.assertEqual(df.columns, ['col1', '_2']) def test_infer_schema_fails(self): - with self.assertRaisesRegexp(TypeError, 'field a'): + with self.assertRaisesRegex(TypeError, 'field a'): self.spark.createDataFrame(self.spark.sparkContext.parallelize([[1, 1], ["x", 1]]), schema=["a", "b"], samplingRatio=0.99) @@ -578,18 +578,18 @@ def test_merge_type(self): ArrayType(LongType()), ArrayType(LongType()) ), ArrayType(LongType())) - with self.assertRaisesRegexp(TypeError, 'element in array'): + with self.assertRaisesRegex(TypeError, 'element in array'): _merge_type(ArrayType(LongType()), ArrayType(DoubleType())) self.assertEqual(_merge_type( MapType(StringType(), LongType()), MapType(StringType(), LongType()) ), MapType(StringType(), LongType())) - with self.assertRaisesRegexp(TypeError, 'key of map'): + with self.assertRaisesRegex(TypeError, 'key of map'): _merge_type( MapType(StringType(), LongType()), MapType(DoubleType(), LongType())) - with self.assertRaisesRegexp(TypeError, 'value of map'): + with self.assertRaisesRegex(TypeError, 'value of map'): _merge_type( MapType(StringType(), LongType()), MapType(StringType(), DoubleType())) @@ -598,7 +598,7 @@ def test_merge_type(self): StructType([StructField("f1", LongType()), StructField("f2", StringType())]), StructType([StructField("f1", LongType()), StructField("f2", StringType())]) ), StructType([StructField("f1", LongType()), StructField("f2", StringType())])) - with self.assertRaisesRegexp(TypeError, 'field f1'): + with self.assertRaisesRegex(TypeError, 'field f1'): _merge_type( StructType([StructField("f1", LongType()), StructField("f2", StringType())]), StructType([StructField("f1", DoubleType()), StructField("f2", StringType())])) @@ -607,7 +607,7 @@ def test_merge_type(self): StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), StructType([StructField("f1", StructType([StructField("f2", LongType())]))]) ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))])) - with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'): + with self.assertRaisesRegex(TypeError, 'field f2 in field f1'): _merge_type( StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), StructType([StructField("f1", StructType([StructField("f2", StringType())]))])) @@ -616,7 +616,7 @@ def test_merge_type(self): StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]), StructType([StructField("f1", 
ArrayType(LongType())), StructField("f2", StringType())]) ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])) - with self.assertRaisesRegexp(TypeError, 'element in array field f1'): + with self.assertRaisesRegex(TypeError, 'element in array field f1'): _merge_type( StructType([ StructField("f1", ArrayType(LongType())), @@ -635,7 +635,7 @@ def test_merge_type(self): ), StructType([ StructField("f1", MapType(StringType(), LongType())), StructField("f2", StringType())])) - with self.assertRaisesRegexp(TypeError, 'value of map field f1'): + with self.assertRaisesRegex(TypeError, 'value of map field f1'): _merge_type( StructType([ StructField("f1", MapType(StringType(), LongType())), @@ -648,7 +648,7 @@ def test_merge_type(self): StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]) ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])) - with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'): + with self.assertRaisesRegex(TypeError, 'key of map element in array field f1'): _merge_type( StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))]) @@ -734,7 +734,7 @@ def assertCollectSuccess(typecode, value): unsupported_types = all_types - set(supported_types) # test unsupported types for t in unsupported_types: - with self.assertRaisesRegexp(TypeError, "infer the type of the field myarray"): + with self.assertRaisesRegex(TypeError, "infer the type of the field myarray"): a = array.array(t) self.spark.createDataFrame([Row(myarray=a)]).collect() @@ -789,13 +789,13 @@ def test_invalid_create_row(self): class DataTypeVerificationTests(unittest.TestCase): def test_verify_type_exception_msg(self): - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "test_name", lambda: _make_type_verifier(StringType(), nullable=False, name="test_name")(None)) schema = StructType([StructField('a', StructType([StructField('b', IntegerType())]))]) - self.assertRaisesRegexp( + self.assertRaisesRegex( TypeError, "field b in field a", lambda: _make_type_verifier(schema)([["data"]])) diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py index 9a1c0edcce4ed..bfc55dff94540 100644 --- a/python/pyspark/sql/tests/test_udf.py +++ b/python/pyspark/sql/tests/test_udf.py @@ -98,7 +98,7 @@ def test_udf_registration_return_type_none(self): def test_udf_registration_return_type_not_none(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(TypeError, "Invalid return type"): + with self.assertRaisesRegex(TypeError, "Invalid return type"): self.spark.catalog.registerFunction( "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType()) @@ -149,9 +149,9 @@ def test_nondeterministic_udf_in_aggregate(self): df = self.spark.range(10) with QuietTest(self.sc): - with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + with self.assertRaisesRegex(AnalysisException, "nondeterministic"): df.groupby('id').agg(sum(udf_random_col())).collect() - with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + with self.assertRaisesRegex(AnalysisException, "nondeterministic"): df.agg(sum(udf_random_col())).collect() def test_chained_udf(self): @@ -203,7 +203,7 @@ def test_udf_in_join_condition(self): # Cross join. 
df = left.join(right, f("a", "b")) with self.sql_conf({"spark.sql.crossJoin.enabled": False}): - with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'): + with self.assertRaisesRegex(AnalysisException, 'Detected implicit cartesian product'): df.collect() with self.sql_conf({"spark.sql.crossJoin.enabled": True}): self.assertEqual(df.collect(), [Row(a=1, b=1)]) @@ -238,7 +238,7 @@ def test_udf_not_supported_in_join_condition(self): f = udf(lambda a, b: a == b, BooleanType()) def runWithJoinType(join_type, type_string): - with self.assertRaisesRegexp( + with self.assertRaisesRegex( AnalysisException, 'Using PythonUDF.*%s is not supported.' % type_string): left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect() @@ -385,18 +385,18 @@ def test_register_java_udaf(self): def test_non_existed_udf(self): spark = self.spark - self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", - lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf")) + self.assertRaisesRegex(AnalysisException, "Can not load class non_existed_udf", + lambda: spark.udf.registerJavaFunction("udf1", "non_existed_udf")) # This is to check if a deprecated 'SQLContext.registerJavaFunction' can call its alias. sqlContext = spark._wrapped - self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udf", - lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf")) + self.assertRaisesRegex(AnalysisException, "Can not load class non_existed_udf", + lambda: sqlContext.registerJavaFunction("udf1", "non_existed_udf")) def test_non_existed_udaf(self): spark = self.spark - self.assertRaisesRegexp(AnalysisException, "Can not load class non_existed_udaf", - lambda: spark.udf.registerJavaUDAF("udaf1", "non_existed_udaf")) + self.assertRaisesRegex(AnalysisException, "Can not load class non_existed_udaf", + lambda: spark.udf.registerJavaUDAF("udaf1", "non_existed_udaf")) def test_udf_with_input_file_name(self): from pyspark.sql.functions import input_file_name @@ -587,17 +587,17 @@ def test_datasource_with_udf(self): for df in [filesource_df, datasource_df, datasource_v2_df]: result = df.withColumn('c', c1) expected = df.withColumn('c', lit(2)) - self.assertEquals(expected.collect(), result.collect()) + self.assertEqual(expected.collect(), result.collect()) for df in [filesource_df, datasource_df, datasource_v2_df]: result = df.withColumn('c', c2) expected = df.withColumn('c', col('i') + 1) - self.assertEquals(expected.collect(), result.collect()) + self.assertEqual(expected.collect(), result.collect()) for df in [filesource_df, datasource_df, datasource_v2_df]: for f in [f1, f2]: result = df.filter(f) - self.assertEquals(0, result.count()) + self.assertEqual(0, result.count()) finally: shutil.rmtree(path) diff --git a/python/pyspark/sql/tests/test_utils.py b/python/pyspark/sql/tests/test_utils.py index b08e17208d8af..005f0e892b60f 100644 --- a/python/pyspark/sql/tests/test_utils.py +++ b/python/pyspark/sql/tests/test_utils.py @@ -31,23 +31,22 @@ def test_capture_user_friendly_exception(self): try: self.spark.sql("select `中文字段`") except AnalysisException as e: - self.assertRegexpMatches(str(e), "cannot resolve '`中文字段`'") + self.assertRegex(str(e), "cannot resolve '`中文字段`'") def test_capture_parse_exception(self): self.assertRaises(ParseException, lambda: self.spark.sql("abc")) def test_capture_illegalargument_exception(self): - self.assertRaisesRegexp(IllegalArgumentException, "Setting negative mapred.reduce.tasks", - lambda: 
self.spark.sql("SET mapred.reduce.tasks=-1")) + self.assertRaisesRegex(IllegalArgumentException, "Setting negative mapred.reduce.tasks", + lambda: self.spark.sql("SET mapred.reduce.tasks=-1")) df = self.spark.createDataFrame([(1, 2)], ["a", "b"]) - self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", - lambda: df.select(sha2(df.a, 1024)).collect()) + self.assertRaisesRegex(IllegalArgumentException, "1024 is not in the permitted values", + lambda: df.select(sha2(df.a, 1024)).collect()) try: df.select(sha2(df.a, 1024)).collect() except IllegalArgumentException as e: - self.assertRegexpMatches(e.desc, "1024 is not in the permitted values") - self.assertRegexpMatches(e.stackTrace, - "org.apache.spark.sql.functions") + self.assertRegex(e.desc, "1024 is not in the permitted values") + self.assertRegex(e.stackTrace, "org.apache.spark.sql.functions") if __name__ == "__main__": diff --git a/python/pyspark/tests/test_profiler.py b/python/pyspark/tests/test_profiler.py index de72a547b0844..e621321283dab 100644 --- a/python/pyspark/tests/test_profiler.py +++ b/python/pyspark/tests/test_profiler.py @@ -85,11 +85,11 @@ class ProfilerTests2(unittest.TestCase): def test_profiler_disabled(self): sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false")) try: - self.assertRaisesRegexp( + self.assertRaisesRegex( RuntimeError, "'spark.python.profile' configuration must be set", lambda: sc.show_profiles()) - self.assertRaisesRegexp( + self.assertRaisesRegex( RuntimeError, "'spark.python.profile' configuration must be set", lambda: sc.dump_profiles("/tmp/abc")) diff --git a/python/pyspark/tests/test_rdd.py b/python/pyspark/tests/test_rdd.py index 47b8f10a5b05e..b17c039889a71 100644 --- a/python/pyspark/tests/test_rdd.py +++ b/python/pyspark/tests/test_rdd.py @@ -733,25 +733,25 @@ def stopit(*x): keyed_rdd = self.sc.parallelize((x % 2, x) for x in range(10)) msg = "Caught StopIteration thrown from user's code; failing the task" - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.map(stopit).collect) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.filter(stopit).collect) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.reduce, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.fold, 0, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, seq_rdd.foreach, stopit) - self.assertRaisesRegexp(Py4JJavaError, msg, - seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect) + self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.map(stopit).collect) + self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.filter(stopit).collect) + self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.foreach, stopit) + self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.reduce, stopit) + self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.fold, 0, stopit) + self.assertRaisesRegex(Py4JJavaError, msg, seq_rdd.foreach, stopit) + self.assertRaisesRegex(Py4JJavaError, msg, + seq_rdd.cartesian(seq_rdd).flatMap(stopit).collect) # these methods call the user function both in the driver and in the executor # the exception raised is different according to where the StopIteration happens # RuntimeError is raised if in the driver # Py4JJavaError is raised if in the executor (wraps the RuntimeError raised in the worker) - self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, - keyed_rdd.reduceByKeyLocally, stopit) - self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, - seq_rdd.aggregate, 0, stopit, lambda *x: 
1) - self.assertRaisesRegexp((Py4JJavaError, RuntimeError), msg, - seq_rdd.aggregate, 0, lambda *x: 1, stopit) + self.assertRaisesRegex((Py4JJavaError, RuntimeError), msg, + keyed_rdd.reduceByKeyLocally, stopit) + self.assertRaisesRegex((Py4JJavaError, RuntimeError), msg, + seq_rdd.aggregate, 0, stopit, lambda *x: 1) + self.assertRaisesRegex((Py4JJavaError, RuntimeError), msg, + seq_rdd.aggregate, 0, lambda *x: 1, stopit) def test_overwritten_global_func(self): # Regression test for SPARK-27000 @@ -768,7 +768,7 @@ def fail(_): rdd = self.sc.range(10).map(fail) - with self.assertRaisesRegexp(Exception, "local iterator error"): + with self.assertRaisesRegex(Exception, "local iterator error"): for _ in rdd.toLocalIterator(): pass diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index d7a4b84e8dc41..51ebee4de7cec 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -165,7 +165,7 @@ def f(): self.sc.parallelize([1]).map(lambda x: f()).count() except Py4JJavaError as e: - self.assertRegexpMatches(str(e), "exception with 中") + self.assertRegex(str(e), "exception with 中") class WorkerReuseTest(PySparkTestCase): From 80161238fe9393aabd5fcd56752ff1e43f6989b1 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 1 Dec 2020 09:36:42 +0800 Subject: [PATCH 057/150] [SPARK-33592] Fix: Pyspark ML Validator params in estimatorParamMaps may be lost after saving and reloading ### What changes were proposed in this pull request? Fix: Pyspark ML Validator params in estimatorParamMaps may be lost after saving and reloading When saving validator estimatorParamMaps, will check all nested stages in tuned estimator to get correct param parent. Two typical cases to manually test: ~~~python tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression() pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100]) \ .addGrid(lr.maxIter, [100, 200]) \ .build() tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator()) tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) # check `loadedTvs.getEstimatorParamMaps()` restored correctly. ~~~ ~~~python lr = LogisticRegression() ova = OneVsRest(classifier=lr) grid = ParamGridBuilder().addGrid(lr.maxIter, [100, 200]).build() evaluator = MulticlassClassificationEvaluator() tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator) tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) # check `loadedTvs.getEstimatorParamMaps()` restored correctly. ~~~ ### Why are the changes needed? Bug fix. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test. Closes #30539 from WeichenXu123/fix_tuning_param_maps_io. 
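Both manual checks above amount to comparing the restored param maps entry by entry, the way the new test helper in this patch does. A minimal standalone sketch of that comparison (the helper name `param_maps_look_equal` is illustrative, not part of the patch):

~~~python
from pyspark.ml.param import Params

def param_maps_look_equal(param_maps1, param_maps2):
    """Illustrative only: mirrors the assert_param_maps_equal test helper added below."""
    if len(param_maps1) != len(param_maps2):
        return False
    for m1, m2 in zip(param_maps1, param_maps2):
        # A param is identified by its owner's uid (parent) plus its name.
        if {(p.parent, p.name) for p in m1} != {(p.parent, p.name) for p in m2}:
            return False
        for p1, v1 in m1.items():
            v2 = next(v for p2, v in m2.items()
                      if (p2.parent, p2.name) == (p1.parent, p1.name))
            if isinstance(v1, Params):
                # Nested estimators/transformers are compared by uid.
                if v1.uid != v2.uid:
                    return False
            elif v1 != v2:
                return False
    return True

# e.g. param_maps_look_equal(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
~~~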
Authored-by: Weichen Xu Signed-off-by: Ruifeng Zheng --- dev/sparktestsupport/modules.py | 1 + python/pyspark/ml/classification.py | 46 +------------ python/pyspark/ml/param/__init__.py | 6 ++ python/pyspark/ml/pipeline.py | 53 +-------------- python/pyspark/ml/tests/test_tuning.py | 47 +++++++++++-- python/pyspark/ml/tests/test_util.py | 84 +++++++++++++++++++++++ python/pyspark/ml/tuning.py | 94 ++++++++++++++++++++++++-- python/pyspark/ml/util.py | 38 +++++++++++ python/pyspark/ml/util.pyi | 6 ++ 9 files changed, 268 insertions(+), 107 deletions(-) create mode 100644 python/pyspark/ml/tests/test_util.py diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 868e4a5d23ed7..5d8b714711774 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -564,6 +564,7 @@ def __hash__(self): "pyspark.ml.tests.test_stat", "pyspark.ml.tests.test_training_summary", "pyspark.ml.tests.test_tuning", + "pyspark.ml.tests.test_util", "pyspark.ml.tests.test_wrapper", ], excluded_python_implementations=[ diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 50882fc895d6c..763038ede876a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -36,7 +36,7 @@ from pyspark.ml.util import JavaMLWritable, JavaMLReadable, HasTrainingSummary from pyspark.ml.wrapper import JavaParams, \ JavaPredictor, JavaPredictionModel, JavaWrapper -from pyspark.ml.common import inherit_doc, _java2py, _py2java +from pyspark.ml.common import inherit_doc from pyspark.ml.linalg import Vectors from pyspark.sql import DataFrame from pyspark.sql.functions import udf, when @@ -2991,50 +2991,6 @@ def _to_java(self): _java_obj.setRawPredictionCol(self.getRawPredictionCol()) return _java_obj - def _make_java_param_pair(self, param, value): - """ - Makes a Java param pair. - """ - sc = SparkContext._active_spark_context - param = self._resolveParam(param) - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.classification.OneVsRest", - self.uid) - java_param = _java_obj.getParam(param.name) - if isinstance(value, JavaParams): - # used in the case of an estimator having another estimator as a parameter - # the reason why this is not in _py2java in common.py is that importing - # Estimator and Model in common.py results in a circular import with inherit_doc - java_value = value._to_java() - else: - java_value = _py2java(sc, value) - return java_param.w(java_value) - - def _transfer_param_map_to_java(self, pyParamMap): - """ - Transforms a Python ParamMap into a Java ParamMap. - """ - paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap") - for param in self.params: - if param in pyParamMap: - pair = self._make_java_param_pair(param, pyParamMap[param]) - paramMap.put([pair]) - return paramMap - - def _transfer_param_map_from_java(self, javaParamMap): - """ - Transforms a Java ParamMap into a Python ParamMap. 
- """ - sc = SparkContext._active_spark_context - paramMap = dict() - for pair in javaParamMap.toList(): - param = pair.param() - if self.hasParam(str(param.name())): - if param.name() == "classifier": - paramMap[self.getParam(param.name())] = JavaParams._from_java(pair.value()) - else: - paramMap[self.getParam(param.name())] = _java2py(sc, pair.value()) - return paramMap - class OneVsRestModel(Model, _OneVsRestParams, JavaMLReadable, JavaMLWritable): """ diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index f2381a4c42698..3eab6607aa7ee 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -437,6 +437,12 @@ def _resolveParam(self, param): else: raise ValueError("Cannot resolve %r as a param." % param) + def _testOwnParam(self, param_parent, param_name): + """ + Test the ownership. Return True or False + """ + return self.uid == param_parent and self.hasParam(param_name) + @staticmethod def _dummy(): """ diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index a6471a8dd1fe5..b0aa735709e8d 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -21,8 +21,8 @@ from pyspark.ml.param import Param, Params from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader, \ DefaultParamsReader, DefaultParamsWriter, MLWriter, MLReader, JavaMLWritable -from pyspark.ml.wrapper import JavaParams, JavaWrapper -from pyspark.ml.common import inherit_doc, _java2py, _py2java +from pyspark.ml.wrapper import JavaParams +from pyspark.ml.common import inherit_doc @inherit_doc @@ -190,55 +190,6 @@ def _to_java(self): return _java_obj - def _make_java_param_pair(self, param, value): - """ - Makes a Java param pair. - """ - sc = SparkContext._active_spark_context - param = self._resolveParam(param) - java_param = sc._jvm.org.apache.spark.ml.param.Param(param.parent, param.name, param.doc) - if isinstance(value, Params) and hasattr(value, "_to_java"): - # Convert JavaEstimator/JavaTransformer object or Estimator/Transformer object which - # implements `_to_java` method (such as OneVsRest, Pipeline object) to java object. - # used in the case of an estimator having another estimator as a parameter - # the reason why this is not in _py2java in common.py is that importing - # Estimator and Model in common.py results in a circular import with inherit_doc - java_value = value._to_java() - else: - java_value = _py2java(sc, value) - return java_param.w(java_value) - - def _transfer_param_map_to_java(self, pyParamMap): - """ - Transforms a Python ParamMap into a Java ParamMap. - """ - paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap") - for param in self.params: - if param in pyParamMap: - pair = self._make_java_param_pair(param, pyParamMap[param]) - paramMap.put([pair]) - return paramMap - - def _transfer_param_map_from_java(self, javaParamMap): - """ - Transforms a Java ParamMap into a Python ParamMap. - """ - sc = SparkContext._active_spark_context - paramMap = dict() - for pair in javaParamMap.toList(): - param = pair.param() - if self.hasParam(str(param.name())): - java_obj = pair.value() - if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(java_obj): - # Note: JavaParams._from_java support both JavaEstimator/JavaTransformer class - # and Estimator/Transformer class which implements `_from_java` static method - # (such as OneVsRest, Pipeline class). 
- py_obj = JavaParams._from_java(java_obj) - else: - py_obj = _java2py(sc, java_obj) - paramMap[self.getParam(param.name())] = py_obj - return paramMap - @inherit_doc class PipelineWriter(MLWriter): diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index ced32c07f245f..ebd7457e4d30a 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -73,7 +73,21 @@ def test_addGrid(self): .build()) -class CrossValidatorTests(SparkSessionTestCase): +class ValidatorTestUtilsMixin: + def assert_param_maps_equal(self, paramMaps1, paramMaps2): + self.assertEqual(len(paramMaps1), len(paramMaps2)) + for paramMap1, paramMap2 in zip(paramMaps1, paramMaps2): + self.assertEqual(set(paramMap1.keys()), set(paramMap2.keys())) + for param in paramMap1.keys(): + v1 = paramMap1[param] + v2 = paramMap2[param] + if isinstance(v1, Params): + self.assertEqual(v1.uid, v2.uid) + else: + self.assertEqual(v1, v2) + + +class CrossValidatorTests(SparkSessionTestCase, ValidatorTestUtilsMixin): def test_copy(self): dataset = self.spark.createDataFrame([ @@ -256,7 +270,7 @@ def test_save_load_simple_estimator(self): loadedCV = CrossValidator.load(cvPath) self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid) self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid) - self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps()) + self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps()) # test save/load of CrossValidatorModel cvModelPath = temp_path + "/cvModel" @@ -351,6 +365,7 @@ def test_save_load_nested_estimator(self): cvPath = temp_path + "/cv" cv.save(cvPath) loadedCV = CrossValidator.load(cvPath) + self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), grid) self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid) self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid) @@ -367,6 +382,7 @@ def test_save_load_nested_estimator(self): cvModelPath = temp_path + "/cvModel" cvModel.save(cvModelPath) loadedModel = CrossValidatorModel.load(cvModelPath) + self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) def test_save_load_pipeline_estimator(self): @@ -401,6 +417,11 @@ def test_save_load_pipeline_estimator(self): estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator(), numFolds=2) # use 3+ folds in practice + cvPath = temp_path + "/cv" + crossval.save(cvPath) + loadedCV = CrossValidator.load(cvPath) + self.assert_param_maps_equal(loadedCV.getEstimatorParamMaps(), paramGrid) + self.assertEqual(loadedCV.getEstimator().uid, crossval.getEstimator().uid) # Run cross-validation, and choose the best set of parameters. cvModel = crossval.fit(training) @@ -421,6 +442,11 @@ def test_save_load_pipeline_estimator(self): estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator(), numFolds=2) # use 3+ folds in practice + cv2Path = temp_path + "/cv2" + crossval2.save(cv2Path) + loadedCV2 = CrossValidator.load(cv2Path) + self.assert_param_maps_equal(loadedCV2.getEstimatorParamMaps(), paramGrid) + self.assertEqual(loadedCV2.getEstimator().uid, crossval2.getEstimator().uid) # Run cross-validation, and choose the best set of parameters. 
cvModel2 = crossval2.fit(training) @@ -511,7 +537,7 @@ def test_invalid_user_specified_folds(self): cv.fit(dataset_with_folds) -class TrainValidationSplitTests(SparkSessionTestCase): +class TrainValidationSplitTests(SparkSessionTestCase, ValidatorTestUtilsMixin): def test_fit_minimize_metric(self): dataset = self.spark.createDataFrame([ @@ -632,7 +658,8 @@ def test_save_load_simple_estimator(self): loadedTvs = TrainValidationSplit.load(tvsPath) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) - self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps()) + self.assert_param_maps_equal( + loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps()) tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) @@ -713,6 +740,7 @@ def test_save_load_nested_estimator(self): tvsPath = temp_path + "/tvs" tvs.save(tvsPath) loadedTvs = TrainValidationSplit.load(tvsPath) + self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid) self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) @@ -728,6 +756,7 @@ def test_save_load_nested_estimator(self): tvsModelPath = temp_path + "/tvsModel" tvsModel.save(tvsModelPath) loadedModel = TrainValidationSplitModel.load(tvsModelPath) + self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) def test_save_load_pipeline_estimator(self): @@ -761,6 +790,11 @@ def test_save_load_pipeline_estimator(self): tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator()) + tvsPath = temp_path + "/tvs" + tvs.save(tvsPath) + loadedTvs = TrainValidationSplit.load(tvsPath) + self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid) + self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) # Run train validation split, and choose the best set of parameters. tvsModel = tvs.fit(training) @@ -780,6 +814,11 @@ def test_save_load_pipeline_estimator(self): tvs2 = TrainValidationSplit(estimator=nested_pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator()) + tvs2Path = temp_path + "/tvs2" + tvs2.save(tvs2Path) + loadedTvs2 = TrainValidationSplit.load(tvs2Path) + self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid) + self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid) # Run train validation split, and choose the best set of parameters. tvsModel2 = tvs2.fit(training) diff --git a/python/pyspark/ml/tests/test_util.py b/python/pyspark/ml/tests/test_util.py new file mode 100644 index 0000000000000..498a649e480a8 --- /dev/null +++ b/python/pyspark/ml/tests/test_util.py @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.ml import Pipeline +from pyspark.ml.classification import LogisticRegression, OneVsRest +from pyspark.ml.feature import VectorAssembler +from pyspark.ml.linalg import Vectors +from pyspark.ml.util import MetaAlgorithmReadWrite +from pyspark.testing.mlutils import SparkSessionTestCase + + +class MetaAlgorithmReadWriteTests(SparkSessionTestCase): + + def test_getAllNestedStages(self): + def _check_uid_set_equal(stages, expected_stages): + uids = set(map(lambda x: x.uid, stages)) + expected_uids = set(map(lambda x: x.uid, expected_stages)) + self.assertEqual(uids, expected_uids) + + df1 = self.spark.createDataFrame([ + (Vectors.dense([1., 2.]), 1.0), + (Vectors.dense([-1., -2.]), 0.0), + ], ['features', 'label']) + df2 = self.spark.createDataFrame([ + (1., 2., 1.0), + (1., 2., 0.0), + ], ['a', 'b', 'label']) + vs = VectorAssembler(inputCols=['a', 'b'], outputCol='features') + lr = LogisticRegression() + pipeline = Pipeline(stages=[vs, lr]) + pipelineModel = pipeline.fit(df2) + ova = OneVsRest(classifier=lr) + ovaModel = ova.fit(df1) + + ova_pipeline = Pipeline(stages=[vs, ova]) + nested_pipeline = Pipeline(stages=[ova_pipeline]) + + _check_uid_set_equal( + MetaAlgorithmReadWrite.getAllNestedStages(pipeline), + [pipeline, vs, lr] + ) + _check_uid_set_equal( + MetaAlgorithmReadWrite.getAllNestedStages(pipelineModel), + [pipelineModel] + pipelineModel.stages + ) + _check_uid_set_equal( + MetaAlgorithmReadWrite.getAllNestedStages(ova), + [ova, lr] + ) + _check_uid_set_equal( + MetaAlgorithmReadWrite.getAllNestedStages(ovaModel), + [ovaModel, lr] + ovaModel.models + ) + _check_uid_set_equal( + MetaAlgorithmReadWrite.getAllNestedStages(nested_pipeline), + [nested_pipeline, ova_pipeline, vs, ova, lr] + ) + + +if __name__ == "__main__": + from pyspark.ml.tests.test_util import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 6f4ad99484546..2b5a9857b0f18 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -26,8 +26,9 @@ from pyspark.ml.common import _py2java, _java2py from pyspark.ml.param import Params, Param, TypeConverters from pyspark.ml.param.shared import HasCollectSubModels, HasParallelism, HasSeed -from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader -from pyspark.ml.wrapper import JavaParams +from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader, \ + MetaAlgorithmReadWrite +from pyspark.ml.wrapper import JavaParams, JavaEstimator, JavaWrapper from pyspark.sql.functions import col, lit, rand, UserDefinedFunction from pyspark.sql.types import BooleanType @@ -64,6 +65,10 @@ def _parallelFitTasks(est, train, eva, validation, epm, collectSubModel): def singleTask(): index, model = next(modelIter) + # TODO: duplicate evaluator to take extra params from input + # Note: Supporting tuning params in evaluator need 
update method + # `MetaAlgorithmReadWrite.getAllNestedStages`, make it return + # all nested stages and evaluators metric = eva.evaluate(model.transform(validation, epm[index])) return index, metric, model if collectSubModel else None @@ -186,8 +191,16 @@ def _from_java_impl(cls, java_stage): # Load information from java_stage to the instance. estimator = JavaParams._from_java(java_stage.getEstimator()) evaluator = JavaParams._from_java(java_stage.getEvaluator()) - epms = [estimator._transfer_param_map_from_java(epm) - for epm in java_stage.getEstimatorParamMaps()] + if isinstance(estimator, JavaEstimator): + epms = [estimator._transfer_param_map_from_java(epm) + for epm in java_stage.getEstimatorParamMaps()] + elif MetaAlgorithmReadWrite.isMetaEstimator(estimator): + # Meta estimator such as Pipeline, OneVsRest + epms = _ValidatorSharedReadWrite.meta_estimator_transfer_param_maps_from_java( + estimator, java_stage.getEstimatorParamMaps()) + else: + raise ValueError('Unsupported estimator used in tuning: ' + str(estimator)) + return estimator, epms, evaluator def _to_java_impl(self): @@ -198,15 +211,82 @@ def _to_java_impl(self): gateway = SparkContext._gateway cls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap - java_epms = gateway.new_array(cls, len(self.getEstimatorParamMaps())) - for idx, epm in enumerate(self.getEstimatorParamMaps()): - java_epms[idx] = self.getEstimator()._transfer_param_map_to_java(epm) + estimator = self.getEstimator() + if isinstance(estimator, JavaEstimator): + java_epms = gateway.new_array(cls, len(self.getEstimatorParamMaps())) + for idx, epm in enumerate(self.getEstimatorParamMaps()): + java_epms[idx] = self.getEstimator()._transfer_param_map_to_java(epm) + elif MetaAlgorithmReadWrite.isMetaEstimator(estimator): + # Meta estimator such as Pipeline, OneVsRest + java_epms = _ValidatorSharedReadWrite.meta_estimator_transfer_param_maps_to_java( + estimator, self.getEstimatorParamMaps()) + else: + raise ValueError('Unsupported estimator used in tuning: ' + str(estimator)) java_estimator = self.getEstimator()._to_java() java_evaluator = self.getEvaluator()._to_java() return java_estimator, java_epms, java_evaluator +class _ValidatorSharedReadWrite: + @staticmethod + def meta_estimator_transfer_param_maps_to_java(pyEstimator, pyParamMaps): + pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator) + stagePairs = list(map(lambda stage: (stage, stage._to_java()), pyStages)) + sc = SparkContext._active_spark_context + + paramMapCls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap + javaParamMaps = SparkContext._gateway.new_array(paramMapCls, len(pyParamMaps)) + + for idx, pyParamMap in enumerate(pyParamMaps): + javaParamMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap") + for pyParam, pyValue in pyParamMap.items(): + javaParam = None + for pyStage, javaStage in stagePairs: + if pyStage._testOwnParam(pyParam.parent, pyParam.name): + javaParam = javaStage.getParam(pyParam.name) + break + if javaParam is None: + raise ValueError('Resolve param in estimatorParamMaps failed: ' + str(pyParam)) + if isinstance(pyValue, Params) and hasattr(pyValue, "_to_java"): + javaValue = pyValue._to_java() + else: + javaValue = _py2java(sc, pyValue) + pair = javaParam.w(javaValue) + javaParamMap.put([pair]) + javaParamMaps[idx] = javaParamMap + return javaParamMaps + + @staticmethod + def meta_estimator_transfer_param_maps_from_java(pyEstimator, javaParamMaps): + pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator) + stagePairs = 
list(map(lambda stage: (stage, stage._to_java()), pyStages)) + sc = SparkContext._active_spark_context + pyParamMaps = [] + for javaParamMap in javaParamMaps: + pyParamMap = dict() + for javaPair in javaParamMap.toList(): + javaParam = javaPair.param() + pyParam = None + for pyStage, javaStage in stagePairs: + if pyStage._testOwnParam(javaParam.parent(), javaParam.name()): + pyParam = pyStage.getParam(javaParam.name()) + if pyParam is None: + raise ValueError('Resolve param in estimatorParamMaps failed: ' + + javaParam.parent() + '.' + javaParam.name()) + javaValue = javaPair.value() + if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(javaValue): + # Note: JavaParams._from_java support both JavaEstimator/JavaTransformer class + # and Estimator/Transformer class which implements `_from_java` static method + # (such as OneVsRest, Pipeline class). + pyValue = JavaParams._from_java(javaValue) + else: + pyValue = _java2py(sc, javaValue) + pyParamMap[pyParam] = pyValue + pyParamMaps.append(pyParamMap) + return pyParamMaps + + class _CrossValidatorParams(_ValidatorParams): """ Params for :py:class:`CrossValidator` and :py:class:`CrossValidatorModel`. diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index a7b5a79d75f5f..a34bfb53482a0 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -592,3 +592,41 @@ def summary(self): no summary exists. """ return (self._call_java("summary")) + + +class MetaAlgorithmReadWrite: + + @staticmethod + def isMetaEstimator(pyInstance): + from pyspark.ml import Estimator, Pipeline + from pyspark.ml.tuning import _ValidatorParams + from pyspark.ml.classification import OneVsRest + return isinstance(pyInstance, Pipeline) or isinstance(pyInstance, OneVsRest) or \ + (isinstance(pyInstance, Estimator) and isinstance(pyInstance, _ValidatorParams)) + + @staticmethod + def getAllNestedStages(pyInstance): + from pyspark.ml import Pipeline, PipelineModel + from pyspark.ml.tuning import _ValidatorParams + from pyspark.ml.classification import OneVsRest, OneVsRestModel + + # TODO: We need to handle `RFormulaModel.pipelineModel` here after Pyspark RFormulaModel + # support pipelineModel property. + if isinstance(pyInstance, Pipeline): + pySubStages = pyInstance.getStages() + elif isinstance(pyInstance, PipelineModel): + pySubStages = pyInstance.stages + elif isinstance(pyInstance, _ValidatorParams): + raise ValueError('PySpark does not support nested validator.') + elif isinstance(pyInstance, OneVsRest): + pySubStages = [pyInstance.getClassifier()] + elif isinstance(pyInstance, OneVsRestModel): + pySubStages = [pyInstance.getClassifier()] + pyInstance.models + else: + pySubStages = [] + + nestedStages = [] + for pySubStage in pySubStages: + nestedStages.extend(MetaAlgorithmReadWrite.getAllNestedStages(pySubStage)) + + return [pyInstance] + nestedStages diff --git a/python/pyspark/ml/util.pyi b/python/pyspark/ml/util.pyi index d0781b2e26ed5..e2496e181f14f 100644 --- a/python/pyspark/ml/util.pyi +++ b/python/pyspark/ml/util.pyi @@ -126,3 +126,9 @@ class HasTrainingSummary(Generic[S]): def hasSummary(self) -> bool: ... @property def summary(self) -> S: ... + +class MetaAlgorithmReadWrite: + @staticmethod + def isMetaEstimator(pyInstance: Any) -> bool: ... + @staticmethod + def getAllNestedStages(pyInstance: Any) -> list: ... 
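The crux of the fix is resolving, for each param in the grid, which nested stage of the meta estimator owns it: the stage whose uid equals the param's parent and which declares a param of that name (this is what the new `_testOwnParam` helper encodes). A rough sketch of that lookup using the `MetaAlgorithmReadWrite` utility introduced above; the script is only illustrative and assumes a local Spark session:

```python
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.util import MetaAlgorithmReadWrite

spark = SparkSession.builder.master("local[1]").getOrCreate()

lr = LogisticRegression()
ova = OneVsRest(classifier=lr)
grid = ParamGridBuilder().addGrid(lr.maxIter, [100, 200]).build()

# Walk every nested stage of the meta estimator (here: [ova, lr]) and find,
# for each grid param, the stage whose uid matches the param's parent and
# which actually declares that param.
stages = MetaAlgorithmReadWrite.getAllNestedStages(ova)
for param_map in grid:
    for param in param_map:
        owner = next(s for s in stages
                     if s.uid == param.parent and s.hasParam(param.name))
        print(param.name, "is owned by", owner.uid)

spark.stop()
```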
From c50fcac00ea9b86aa6f6edb738e53ba476261027 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Tue, 1 Dec 2020 11:45:32 +0900 Subject: [PATCH 058/150] [SPARK-33607][SS][WEBUI] Input Rate timeline/histogram aren't rendered if built with Scala 2.13 ### What changes were proposed in this pull request? This PR fixes an issue that the histogram and timeline aren't rendered in the `Streaming Query Statistics` page if we built Spark with Scala 2.13. ![before-fix-the-issue](https://user-images.githubusercontent.com/4736016/100612855-f543d700-3356-11eb-90d9-ede57b8b3f4f.png) ![NaN_Error](https://user-images.githubusercontent.com/4736016/100612879-00970280-3357-11eb-97cf-43978bbe2d3a.png) The reason is [`maxRecordRate` can be `NaN`](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala#L371) for Scala 2.13. The `NaN` is the result of [`query.recentProgress.map(_.inputRowsPerSecond).max`](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala#L372) when the first element of `query.recentProgress.map(_.inputRowsPerSecond)` is `NaN`. Actually, the comparison logic for `Double` type was changed in Scala 2.13. https://github.com/scala/bug/issues/12107 https://github.com/scala/scala/pull/6410 So this issue happens as of Scala 2.13. The root cause of the `NaN` is [here](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala#L164). This `NaN` seems to be an initial value of `inputTimeSec` so I think `Double.PositiveInfinity` is suitable rather than `NaN` and this change can resolve this issue. ### Why are the changes needed? To make sure we can use the histogram/timeline with Scala 2.13. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? First, I built with the following commands. ``` $ /dev/change-scala-version.sh 2.13 $ build/sbt -Phive -Phive-thriftserver -Pscala-2.13 package ``` Then, ran the following query (this is brought from #30427 ). ``` import org.apache.spark.sql.streaming.Trigger val query = spark .readStream .format("rate") .option("rowsPerSecond", 1000) .option("rampUpTime", "10s") .load() .selectExpr("*", "CAST(CAST(timestamp AS BIGINT) - CAST((RAND() * 100000) AS BIGINT) AS TIMESTAMP) AS tsMod") .selectExpr("tsMod", "mod(value, 100) as mod", "value") .withWatermark("tsMod", "10 seconds") .groupBy(window($"tsMod", "1 minute", "10 seconds"), $"mod") .agg(max("value").as("max_value"), min("value").as("min_value"), avg("value").as("avg_value")) .writeStream .format("console") .trigger(Trigger.ProcessingTime("5 seconds")) .outputMode("append") .start() ``` Finally, I confirmed that the timeline and histogram are rendered. ![after-fix-the-issue](https://user-images.githubusercontent.com/4736016/100612736-c9285600-3356-11eb-856d-7e53cc656c36.png) ``` Closes #30546 from sarutak/ss-nan. 
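The practical effect of switching the initial `inputTimeSec` sentinel is plain IEEE-754 arithmetic; the snippet below only illustrates generic NaN propagation in Python (it is not the Scala 2.13 ordering change itself, and the numbers are made up):

```python
nan, inf = float("nan"), float("inf")

# A NaN elapsed time makes the derived rate NaN ...
print(1000 / nan)            # nan
# ... and a leading NaN poisons a running max, because every comparison
# against NaN is false, so the initial NaN is never replaced.
print(max([nan, 1.0, 2.0]))  # nan
print(max([1.0, nan, 2.0]))  # 2.0

# With PositiveInfinity as the initial elapsed time, the first rate is a
# harmless 0.0 and the max over recent rates stays finite.
print(1000 / inf)            # 0.0
print(max([0.0, 1.0, 2.0]))  # 2.0
```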
Authored-by: Kousuke Saruta Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../apache/spark/sql/execution/streaming/ProgressReporter.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index fe3f0e95b383c..57cb551bba17d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -161,7 +161,7 @@ trait ProgressReporter extends Logging { val inputTimeSec = if (lastTriggerStartTimestamp >= 0) { (currentTriggerStartTimestamp - lastTriggerStartTimestamp).toDouble / MILLIS_PER_SECOND } else { - Double.NaN + Double.PositiveInfinity } logDebug(s"Execution stats: $executionStats") From 2af2da5a4b1f5dbf0b55afd0b2514a52f03ffa94 Mon Sep 17 00:00:00 2001 From: "Jungtaek Lim (HeartSaVioR)" Date: Tue, 1 Dec 2020 13:11:14 +0900 Subject: [PATCH 059/150] [SPARK-30900][SS] FileStreamSource: Avoid reading compact metadata log twice if the query restarts from compact batch ### What changes were proposed in this pull request? This patch addresses the case where compact metadata file is read twice in FileStreamSource during restarting query. When restarting the query, there is a case which the query starts from compaction batch, and the batch has source metadata file to read. One case is that the previous query succeeded to read from inputs, but not finalized the batch for various reasons. The patch finds the latest compaction batch when restoring from metadata log, and put entries for the batch into the file entry cache which would avoid reading compact batch file twice. FileStreamSourceLog doesn't know about offset / commit metadata in checkpoint so doesn't know which exactly batch to start from, but in practice, only couple of latest batches are candidates to be started from when restarting query. This patch leverages the fact to skip calculation if possible. ### Why are the changes needed? Spark incurs unnecessary cost on reading the compact metadata file twice on some case, which may not be ignorable when the query has been processed huge number of files so far. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? New UT. Closes #27649 from HeartSaVioR/SPARK-30900. Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../streaming/FileStreamSource.scala | 2 +- .../streaming/FileStreamSourceLog.scala | 27 ++++++++ .../sql/streaming/FileStreamSourceSuite.scala | 64 +++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 42401fe069551..e53c5a9c4024e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -104,7 +104,7 @@ class FileStreamSource( // Visible for testing and debugging in production. 
val seenFiles = new SeenFilesMap(maxFileAgeMs, fileNameOnly) - metadataLog.allFiles().foreach { entry => + metadataLog.restore().foreach { entry => seenFiles.add(entry.path, entry.timestamp) } seenFiles.purge() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala index 88a2326c9a02c..5fe9a39c91e0b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala @@ -36,6 +36,7 @@ class FileStreamSourceLog( extends CompactibleFileStreamLog[FileEntry](metadataLogVersion, sparkSession, path) { import CompactibleFileStreamLog._ + import FileStreamSourceLog._ // Configurations about metadata compaction protected override val defaultCompactInterval: Int = @@ -118,8 +119,34 @@ class FileStreamSourceLog( } batches } + + def restore(): Array[FileEntry] = { + val files = allFiles() + + // When restarting the query, there is a case which the query starts from compaction batch, + // and the batch has source metadata file to read. One case is that the previous query + // succeeded to read from inputs, but not finalized the batch for various reasons. + // The below code finds the latest compaction batch, and put entries for the batch into the + // file entry cache which would avoid reading compact batch file twice. + // It doesn't know about offset / commit metadata in checkpoint so doesn't know which exactly + // batch to start from, but in practice, only couple of latest batches are candidates to + // be started. We leverage the fact to skip calculation if possible. + files.lastOption.foreach { lastEntry => + val latestBatchId = lastEntry.batchId + val latestCompactedBatchId = getAllValidBatches(latestBatchId, compactInterval)(0) + if ((latestBatchId - latestCompactedBatchId) < PREV_NUM_BATCHES_TO_READ_IN_RESTORE) { + val logsForLatestCompactedBatch = files.filter { entry => + entry.batchId == latestCompactedBatchId + } + fileEntryCache.put(latestCompactedBatchId, logsForLatestCompactedBatch) + } + } + + files + } } object FileStreamSourceLog { val VERSION = 1 + val PREV_NUM_BATCHES_TO_READ_IN_RESTORE = 2 } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 718095003b096..3c74e316f260e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -1376,6 +1376,70 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("restore from file stream source log") { + def createEntries(batchId: Long, count: Int): Array[FileEntry] = { + (1 to count).map { idx => + FileEntry(s"path_${batchId}_$idx", 10000 * batchId + count, batchId) + }.toArray + } + + withSQLConf(SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "5") { + def verifyBatchAvailabilityInCache( + fileEntryCache: java.util.LinkedHashMap[Long, Array[FileEntry]], + expectNotAvailable: Seq[Int], + expectAvailable: Seq[Int]): Unit = { + expectNotAvailable.foreach { batchId => + assert(!fileEntryCache.containsKey(batchId.toLong)) + } + expectAvailable.foreach { batchId => + assert(fileEntryCache.containsKey(batchId.toLong)) + } + } + withTempDir { chk => + val _fileEntryCache = 
PrivateMethod[java.util.LinkedHashMap[Long, Array[FileEntry]]]( + Symbol("fileEntryCache")) + + val metadata = new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, + chk.getCanonicalPath) + val fileEntryCache = metadata invokePrivate _fileEntryCache() + + (0 to 4).foreach { batchId => + metadata.add(batchId, createEntries(batchId, 100)) + } + val allFiles = metadata.allFiles() + + // batch 4 is a compact batch which logs would be cached in fileEntryCache + verifyBatchAvailabilityInCache(fileEntryCache, Seq(0, 1, 2, 3), Seq(4)) + + val metadata2 = new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, + chk.getCanonicalPath) + val fileEntryCache2 = metadata2 invokePrivate _fileEntryCache() + + // allFiles() doesn't restore the logs for the latest compact batch into file entry cache + assert(metadata2.allFiles() === allFiles) + verifyBatchAvailabilityInCache(fileEntryCache2, Seq(0, 1, 2, 3, 4), Seq.empty) + + // restore() will restore the logs for the latest compact batch into file entry cache + assert(metadata2.restore() === allFiles) + verifyBatchAvailabilityInCache(fileEntryCache2, Seq(0, 1, 2, 3), Seq(4)) + + (5 to 5 + FileStreamSourceLog.PREV_NUM_BATCHES_TO_READ_IN_RESTORE).foreach { batchId => + metadata2.add(batchId, createEntries(batchId, 100)) + } + + val metadata3 = new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, + chk.getCanonicalPath) + val fileEntryCache3 = metadata3 invokePrivate _fileEntryCache() + + // restore() will not restore the logs for the latest compact batch into file entry cache + // if the latest batch is too far from latest compact batch, because it's unlikely Spark + // will request the batch for the start point. + assert(metadata3.restore() === metadata2.allFiles()) + verifyBatchAvailabilityInCache(fileEntryCache3, Seq(0, 1, 2, 3, 4), Seq.empty) + } + } + } + test("get arbitrary batch from FileStreamSource") { withTempDirs { case (src, tmp) => withSQLConf( From 1a042cc414c0c720535798b9a1197fe8885d6f6e Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Tue, 1 Dec 2020 13:43:02 +0900 Subject: [PATCH 060/150] [SPARK-33530][CORE] Support --archives and spark.archives option natively ### What changes were proposed in this pull request? TL;DR: - This PR completes the support of archives in Spark itself instead of Yarn-only - It makes `--archives` option work in other cluster modes too and adds `spark.archives` configuration. - After this PR, PySpark users can leverage Conda to ship Python packages together as below: ```python conda create -y -n pyspark_env -c conda-forge pyarrow==2.0.0 pandas==1.1.4 conda-pack==0.5.0 conda activate pyspark_env conda pack -f -o pyspark_env.tar.gz PYSPARK_DRIVER_PYTHON=python PYSPARK_PYTHON=./environment/bin/python pyspark --archives pyspark_env.tar.gz#environment ``` - Issue a warning that undocumented and hidden behavior of partial archive handling in `spark.files` / `SparkContext.addFile` will be deprecated, and users can use `spark.archives` and `SparkContext.addArchive`. This PR proposes to add Spark's native `--archives` in Spark submit, and `spark.archives` configuration. Currently, both are supported only in Yarn mode: ```bash ./bin/spark-submit --help ``` ``` Options: ... Spark on YARN only: --queue QUEUE_NAME The YARN queue to submit to (Default: "default"). --archives ARCHIVES Comma separated list of archives to be extracted into the working directory of each executor. ``` This `archives` feature is useful often when you have to ship a directory and unpack into executors. 
One example is native libraries to use e.g. JNI. Another example is to ship Python packages together by Conda environment. Especially for Conda, PySpark currently does not have a nice way to ship a package that works in general, please see also https://hyukjin-spark.readthedocs.io/en/stable/user_guide/python_packaging.html#using-zipped-virtual-environment (PySpark new documentation demo for 3.1.0). The neatest way is arguably to use Conda environment by shipping zipped Conda environment but this is currently dependent on this archive feature. NOTE that we are able to use `spark.files` by relying on its undocumented behaviour that untars `tar.gz` but I don't think we should document such ways and promote people to more rely on it. Also, note that this PR does not target to add the feature parity of `spark.files.overwrite`, `spark.files.useFetchCache`, etc. yet. I documented that this is an experimental feature as well. ### Why are the changes needed? To complete the feature parity, and to provide a better support of shipping Python libraries together with Conda env. ### Does this PR introduce _any_ user-facing change? Yes, this makes `--archives` works in Spark instead of Yarn-only, and adds a new configuration `spark.archives`. ### How was this patch tested? I added unittests. Also, manually tested in standalone cluster, local-cluster, and local modes. Closes #30486 from HyukjinKwon/native-archive. Authored-by: HyukjinKwon Signed-off-by: HyukjinKwon --- .../scala/org/apache/spark/SparkContext.scala | 89 ++++++++++++++++--- .../scala/org/apache/spark/SparkEnv.scala | 5 +- .../org/apache/spark/deploy/SparkSubmit.scala | 3 + .../spark/deploy/SparkSubmitArguments.scala | 5 +- .../org/apache/spark/executor/Executor.scala | 50 ++++++++--- .../spark/internal/config/package.scala | 10 +++ .../spark/scheduler/TaskDescription.scala | 9 +- .../spark/scheduler/TaskSetManager.scala | 2 + .../scala/org/apache/spark/util/Utils.scala | 52 +++++++++-- .../org/apache/spark/SparkContextSuite.scala | 79 ++++++++++++++++ .../spark/deploy/SparkSubmitSuite.scala | 37 ++++++++ .../deploy/rest/SubmitRestProtocolSuite.scala | 3 + .../CoarseGrainedExecutorBackendSuite.scala | 2 +- .../apache/spark/executor/ExecutorSuite.scala | 1 + .../CoarseGrainedSchedulerBackendSuite.scala | 3 +- .../scheduler/EventLoggingListenerSuite.scala | 3 +- .../scheduler/TaskDescriptionSuite.scala | 6 ++ docs/configuration.md | 11 +++ project/MimaExcludes.scala | 1 + .../source/user_guide/python_packaging.rst | 27 +++--- ...esosFineGrainedSchedulerBackendSuite.scala | 2 + 21 files changed, 347 insertions(+), 53 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index b953592fa04dc..86f1d745d91d4 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -22,6 +22,7 @@ import java.net.URI import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID} import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap} import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference} +import javax.ws.rs.core.UriBuilder import scala.collection.JavaConverters._ import scala.collection.Map @@ -39,7 +40,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf, Sequence import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat, Job => NewHadoopJob} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat} -import 
org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.executor.{Executor, ExecutorMetrics, ExecutorMetricsSource} @@ -221,6 +222,7 @@ class SparkContext(config: SparkConf) extends Logging { private var _listenerBusStarted: Boolean = false private var _jars: Seq[String] = _ private var _files: Seq[String] = _ + private var _archives: Seq[String] = _ private var _shutdownHookRef: AnyRef = _ private var _statusStore: AppStatusStore = _ private var _heartbeater: Heartbeater = _ @@ -246,6 +248,7 @@ class SparkContext(config: SparkConf) extends Logging { def jars: Seq[String] = _jars def files: Seq[String] = _files + def archives: Seq[String] = _archives def master: String = _conf.get("spark.master") def deployMode: String = _conf.get(SUBMIT_DEPLOY_MODE) def appName: String = _conf.get("spark.app.name") @@ -278,6 +281,7 @@ class SparkContext(config: SparkConf) extends Logging { // Used to store a URL for each static file/jar together with the file's local timestamp private[spark] val addedFiles = new ConcurrentHashMap[String, Long]().asScala + private[spark] val addedArchives = new ConcurrentHashMap[String, Long]().asScala private[spark] val addedJars = new ConcurrentHashMap[String, Long]().asScala // Keeps track of all persisted RDDs @@ -422,6 +426,7 @@ class SparkContext(config: SparkConf) extends Logging { _jars = Utils.getUserJars(_conf) _files = _conf.getOption(FILES.key).map(_.split(",")).map(_.filter(_.nonEmpty)) .toSeq.flatten + _archives = _conf.getOption(ARCHIVES.key).map(Utils.stringToSeq).toSeq.flatten _eventLogDir = if (isEventLogEnabled) { @@ -506,6 +511,13 @@ class SparkContext(config: SparkConf) extends Logging { } } + if (archives != null) { + archives.foreach(file => addFile(file, false, true, isArchive = true)) + if (addedArchives.nonEmpty) { + _conf.set("spark.app.initial.archive.urls", addedArchives.keys.toSeq.mkString(",")) + } + } + _executorMemory = _conf.getOption(EXECUTOR_MEMORY.key) .orElse(Option(System.getenv("SPARK_EXECUTOR_MEMORY"))) .orElse(Option(System.getenv("SPARK_MEM")) @@ -1521,6 +1533,36 @@ class SparkContext(config: SparkConf) extends Logging { */ def listFiles(): Seq[String] = addedFiles.keySet.toSeq + /** + * :: Experimental :: + * Add an archive to be downloaded and unpacked with this Spark job on every node. + * + * If an archive is added during execution, it will not be available until the next TaskSet + * starts. + * + * @param path can be either a local file, a file in HDFS (or other Hadoop-supported + * filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs, + * use `SparkFiles.get(paths-to-files)` to find its download/unpacked location. + * The given path should be one of .zip, .tar, .tar.gz, .tgz and .jar. + * + * @note A path can be added only once. Subsequent additions of the same path are ignored. + * + * @since 3.1.0 + */ + @Experimental + def addArchive(path: String): Unit = { + addFile(path, false, false, isArchive = true) + } + + /** + * :: Experimental :: + * Returns a list of archive paths that are added to resources. + * + * @since 3.1.0 + */ + @Experimental + def listArchives(): Seq[String] = addedArchives.keySet.toSeq + /** * Add a file to be downloaded with this Spark job on every node. 
* @@ -1538,8 +1580,14 @@ class SparkContext(config: SparkConf) extends Logging { addFile(path, recursive, false) } - private def addFile(path: String, recursive: Boolean, addedOnSubmit: Boolean): Unit = { - val uri = new Path(path).toUri + private def addFile( + path: String, recursive: Boolean, addedOnSubmit: Boolean, isArchive: Boolean = false + ): Unit = { + val uri = if (!isArchive) { + new Path(path).toUri + } else { + Utils.resolveURI(path) + } val schemeCorrectedURI = uri.getScheme match { case null => new File(path).getCanonicalFile.toURI case "local" => @@ -1551,7 +1599,7 @@ class SparkContext(config: SparkConf) extends Logging { val hadoopPath = new Path(schemeCorrectedURI) val scheme = schemeCorrectedURI.getScheme - if (!Array("http", "https", "ftp").contains(scheme)) { + if (!Array("http", "https", "ftp").contains(scheme) && !isArchive) { val fs = hadoopPath.getFileSystem(hadoopConfiguration) val isDir = fs.getFileStatus(hadoopPath).isDirectory if (!isLocal && scheme == "file" && isDir) { @@ -1569,21 +1617,39 @@ class SparkContext(config: SparkConf) extends Logging { val key = if (!isLocal && scheme == "file") { env.rpcEnv.fileServer.addFile(new File(uri.getPath)) + } else if (uri.getScheme == null) { + schemeCorrectedURI.toString + } else if (isArchive) { + uri.toString } else { - if (uri.getScheme == null) { - schemeCorrectedURI.toString - } else { - path - } + path } + val timestamp = if (addedOnSubmit) startTime else System.currentTimeMillis - if (addedFiles.putIfAbsent(key, timestamp).isEmpty) { + if (!isArchive && addedFiles.putIfAbsent(key, timestamp).isEmpty) { logInfo(s"Added file $path at $key with timestamp $timestamp") // Fetch the file locally so that closures which are run on the driver can still use the // SparkFiles API to access files. Utils.fetchFile(uri.toString, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConfiguration, timestamp, useCache = false) postEnvironmentUpdate() + } else if ( + isArchive && + addedArchives.putIfAbsent( + UriBuilder.fromUri(new URI(key)).fragment(uri.getFragment).build().toString, + timestamp).isEmpty) { + logInfo(s"Added archive $path at $key with timestamp $timestamp") + val uriToDownload = UriBuilder.fromUri(new URI(key)).fragment(null).build() + val source = Utils.fetchFile(uriToDownload.toString, Utils.createTempDir(), conf, + env.securityManager, hadoopConfiguration, timestamp, useCache = false, shouldUntar = false) + val dest = new File( + SparkFiles.getRootDirectory(), + if (uri.getFragment != null) uri.getFragment else source.getName) + logInfo( + s"Unpacking an archive $path from ${source.getAbsolutePath} to ${dest.getAbsolutePath}") + Utils.deleteRecursively(dest) + Utils.unpack(source, dest) + postEnvironmentUpdate() } else { logWarning(s"The path $path has been added already. 
Overwriting of added paths " + "is not supported in the current version.") @@ -2495,8 +2561,9 @@ class SparkContext(config: SparkConf) extends Logging { val schedulingMode = getSchedulingMode.toString val addedJarPaths = addedJars.keys.toSeq val addedFilePaths = addedFiles.keys.toSeq + val addedArchivePaths = addedArchives.keys.toSeq val environmentDetails = SparkEnv.environmentDetails(conf, hadoopConfiguration, - schedulingMode, addedJarPaths, addedFilePaths) + schedulingMode, addedJarPaths, addedFilePaths, addedArchivePaths) val environmentUpdate = SparkListenerEnvironmentUpdate(environmentDetails) listenerBus.post(environmentUpdate) } diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index d543359f4dedf..9fc60ac3990fc 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -454,7 +454,8 @@ object SparkEnv extends Logging { hadoopConf: Configuration, schedulingMode: String, addedJars: Seq[String], - addedFiles: Seq[String]): Map[String, Seq[(String, String)]] = { + addedFiles: Seq[String], + addedArchives: Seq[String]): Map[String, Seq[(String, String)]] = { import Properties._ val jvmInformation = Seq( @@ -484,7 +485,7 @@ object SparkEnv extends Logging { .split(File.pathSeparator) .filterNot(_.isEmpty) .map((_, "System Classpath")) - val addedJarsAndFiles = (addedJars ++ addedFiles).map((_, "Added By User")) + val addedJarsAndFiles = (addedJars ++ addedFiles ++ addedArchives).map((_, "Added By User")) val classPaths = (addedJarsAndFiles ++ classPathEntries).sorted // Add Hadoop properties, it will not ignore configs including in Spark. Some spark diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 4aa393c514af6..a344bce7a0f3c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -607,6 +607,8 @@ private[spark] class SparkSubmit extends Logging { confKey = CORES_MAX.key), OptionAssigner(args.files, LOCAL | STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = FILES.key), + OptionAssigner(args.archives, LOCAL | STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, + confKey = ARCHIVES.key), OptionAssigner(args.jars, LOCAL, CLIENT, confKey = JARS.key), OptionAssigner(args.jars, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = JARS.key), @@ -796,6 +798,7 @@ private[spark] class SparkSubmit extends Logging { val pathConfigs = Seq( JARS.key, FILES.key, + ARCHIVES.key, "spark.yarn.dist.files", "spark.yarn.dist.archives", "spark.yarn.dist.jars") diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 3090a3b10a97c..9da1a73bba692 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -183,6 +183,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S name = Option(name).orElse(sparkProperties.get("spark.app.name")).orNull jars = Option(jars).orElse(sparkProperties.get(config.JARS.key)).orNull files = Option(files).orElse(sparkProperties.get(config.FILES.key)).orNull + archives = Option(archives).orElse(sparkProperties.get(config.ARCHIVES.key)).orNull pyFiles = 
Option(pyFiles).orElse(sparkProperties.get(config.SUBMIT_PYTHON_FILES.key)).orNull ivyRepoPath = sparkProperties.get("spark.jars.ivy").orNull ivySettingsPath = sparkProperties.get("spark.jars.ivySettings") @@ -512,6 +513,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | --files FILES Comma-separated list of files to be placed in the working | directory of each executor. File paths of these files | in executors can be accessed via SparkFiles.get(fileName). + | --archives ARCHIVES Comma-separated list of archives to be extracted into the + | working directory of each executor. | | --conf, -c PROP=VALUE Arbitrary Spark configuration property. | --properties-file FILE Path to a file from which to load extra properties. If not @@ -562,8 +565,6 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | | Spark on YARN only: | --queue QUEUE_NAME The YARN queue to submit to (Default: "default"). - | --archives ARCHIVES Comma separated list of archives to be extracted into the - | working directory of each executor. """.stripMargin ) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index c81ac778a32d1..e7f1b8f3cf17a 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -26,6 +26,7 @@ import java.util.{Locale, Properties} import java.util.concurrent._ import java.util.concurrent.atomic.AtomicBoolean import javax.annotation.concurrent.GuardedBy +import javax.ws.rs.core.UriBuilder import scala.collection.JavaConverters._ import scala.collection.immutable @@ -78,6 +79,7 @@ private[spark] class Executor( // Each map holds the master's timestamp for the version of that file or JAR we got. private val currentFiles: HashMap[String, Long] = new HashMap[String, Long]() private val currentJars: HashMap[String, Long] = new HashMap[String, Long]() + private val currentArchives: HashMap[String, Long] = new HashMap[String, Long]() private val EMPTY_BYTE_BUFFER = ByteBuffer.wrap(new Array[Byte](0)) @@ -232,16 +234,17 @@ private[spark] class Executor( private val appStartTime = conf.getLong("spark.app.startTime", 0) // To allow users to distribute plugins and their required files - // specified by --jars and --files on application submission, those jars/files should be - // downloaded and added to the class loader via updateDependencies. - // This should be done before plugin initialization below + // specified by --jars, --files and --archives on application submission, those + // jars/files/archives should be downloaded and added to the class loader via + // updateDependencies. This should be done before plugin initialization below // because executors search plugins from the class loader and initialize them. 
- private val Seq(initialUserJars, initialUserFiles) = Seq("jar", "file").map { key => - conf.getOption(s"spark.app.initial.$key.urls").map { urls => - Map(urls.split(",").map(url => (url, appStartTime)): _*) - }.getOrElse(Map.empty) - } - updateDependencies(initialUserFiles, initialUserJars) + private val Seq(initialUserJars, initialUserFiles, initialUserArchives) = + Seq("jar", "file", "archive").map { key => + conf.getOption(s"spark.app.initial.$key.urls").map { urls => + Map(urls.split(",").map(url => (url, appStartTime)): _*) + }.getOrElse(Map.empty) + } + updateDependencies(initialUserFiles, initialUserJars, initialUserArchives) // Plugins need to load using a class loader that includes the executor's user classpath. // Plugins also needs to be initialized after the heartbeater started @@ -449,7 +452,8 @@ private[spark] class Executor( // requires access to properties contained within (e.g. for access control). Executor.taskDeserializationProps.set(taskDescription.properties) - updateDependencies(taskDescription.addedFiles, taskDescription.addedJars) + updateDependencies( + taskDescription.addedFiles, taskDescription.addedJars, taskDescription.addedArchives) task = ser.deserialize[Task[Any]]( taskDescription.serializedTask, Thread.currentThread.getContextClassLoader) task.localProperties = taskDescription.properties @@ -909,24 +913,42 @@ private[spark] class Executor( * Download any missing dependencies if we receive a new set of files and JARs from the * SparkContext. Also adds any new JARs we fetched to the class loader. */ - private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]): Unit = { + private def updateDependencies( + newFiles: Map[String, Long], + newJars: Map[String, Long], + newArchives: Map[String, Long]): Unit = { lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) synchronized { // Fetch missing dependencies for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) { - logInfo("Fetching " + name + " with timestamp " + timestamp) + logInfo(s"Fetching $name with timestamp $timestamp") // Fetch file with useCache mode, close cache for local mode. Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConf, timestamp, useCache = !isLocal) currentFiles(name) = timestamp } + for ((name, timestamp) <- newArchives if currentArchives.getOrElse(name, -1L) < timestamp) { + logInfo(s"Fetching $name with timestamp $timestamp") + val sourceURI = new URI(name) + val uriToDownload = UriBuilder.fromUri(sourceURI).fragment(null).build() + val source = Utils.fetchFile(uriToDownload.toString, Utils.createTempDir(), conf, + env.securityManager, hadoopConf, timestamp, useCache = !isLocal, shouldUntar = false) + val dest = new File( + SparkFiles.getRootDirectory(), + if (sourceURI.getFragment != null) sourceURI.getFragment else source.getName) + logInfo( + s"Unpacking an archive $name from ${source.getAbsolutePath} to ${dest.getAbsolutePath}") + Utils.deleteRecursively(dest) + Utils.unpack(source, dest) + currentArchives(name) = timestamp + } for ((name, timestamp) <- newJars) { val localName = new URI(name).getPath.split("/").last val currentTimeStamp = currentJars.get(name) .orElse(currentJars.get(localName)) .getOrElse(-1L) if (currentTimeStamp < timestamp) { - logInfo("Fetching " + name + " with timestamp " + timestamp) + logInfo(s"Fetching $name with timestamp $timestamp") // Fetch file with useCache mode, close cache for local mode. 
Utils.fetchFile(name, new File(SparkFiles.getRootDirectory()), conf, env.securityManager, hadoopConf, timestamp, useCache = !isLocal) @@ -934,7 +956,7 @@ private[spark] class Executor( // Add it to our class loader val url = new File(SparkFiles.getRootDirectory(), localName).toURI.toURL if (!urlClassLoader.getURLs().contains(url)) { - logInfo("Adding " + url + " to class loader") + logInfo(s"Adding $url to class loader") urlClassLoader.addURL(url) } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 093a0ecf58d32..6639f20a068d4 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -1813,6 +1813,16 @@ package object config { .toSequence .createWithDefault(Nil) + private[spark] val ARCHIVES = ConfigBuilder("spark.archives") + .version("3.1.0") + .doc("Comma-separated list of archives to be extracted into the working directory of each " + + "executor. .jar, .tar.gz, .tgz and .zip are supported. You can specify the directory " + + "name to unpack via adding '#' after the file name to unpack, for example, " + + "'file.zip#directory'. This configuration is experimental.") + .stringConf + .toSequence + .createWithDefault(Nil) + private[spark] val SUBMIT_DEPLOY_MODE = ConfigBuilder("spark.submit.deployMode") .version("1.5.0") .stringConf diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 863bf27088355..12b911d06153b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -55,6 +55,7 @@ private[spark] class TaskDescription( val partitionId: Int, val addedFiles: Map[String, Long], val addedJars: Map[String, Long], + val addedArchives: Map[String, Long], val properties: Properties, val resources: immutable.Map[String, ResourceInformation], val serializedTask: ByteBuffer) { @@ -99,6 +100,9 @@ private[spark] object TaskDescription { // Write jars. serializeStringLongMap(taskDescription.addedJars, dataOut) + // Write archives. + serializeStringLongMap(taskDescription.addedArchives, dataOut) + // Write properties. dataOut.writeInt(taskDescription.properties.size()) taskDescription.properties.asScala.foreach { case (key, value) => @@ -167,6 +171,9 @@ private[spark] object TaskDescription { // Read jars. val taskJars = deserializeStringLongMap(dataIn) + // Read archives. + val taskArchives = deserializeStringLongMap(dataIn) + // Read properties. 
val properties = new Properties() val numProperties = dataIn.readInt() @@ -185,6 +192,6 @@ private[spark] object TaskDescription { val serializedTask = byteBuffer.slice() new TaskDescription(taskId, attemptNumber, executorId, name, index, partitionId, taskFiles, - taskJars, properties, resources, serializedTask) + taskJars, taskArchives, properties, resources, serializedTask) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 914fccc1a67cd..ad0791fa42931 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -63,6 +63,7 @@ private[spark] class TaskSetManager( // SPARK-21563 make a copy of the jars/files so they are consistent across the TaskSet private val addedJars = HashMap[String, Long](sched.sc.addedJars.toSeq: _*) private val addedFiles = HashMap[String, Long](sched.sc.addedFiles.toSeq: _*) + private val addedArchives = HashMap[String, Long](sched.sc.addedArchives.toSeq: _*) val maxResultSize = conf.get(config.MAX_RESULT_SIZE) @@ -493,6 +494,7 @@ private[spark] class TaskSetManager( task.partitionId, addedFiles, addedJars, + addedArchives, task.localProperties, taskResourceAssignments, serializedTask) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index accf3d7c0d333..ae4df146b0a4c 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -53,6 +53,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec} import org.apache.hadoop.security.UserGroupInformation +import org.apache.hadoop.util.{RunJar, StringUtils} import org.apache.hadoop.yarn.conf.YarnConfiguration import org.eclipse.jetty.util.MultiException import org.slf4j.Logger @@ -486,6 +487,10 @@ private[spark] object Utils extends Logging { * * Throws SparkException if the target file already exists and has different contents than * the requested file. + * + * If `shouldUntar` is true, it untars the given url if it is a tar.gz or tgz into `targetDir`. 
+ * This is a legacy behavior, and users should better use `spark.archives` configuration or + * `SparkContext.addArchive` */ def fetchFile( url: String, @@ -494,7 +499,8 @@ private[spark] object Utils extends Logging { securityMgr: SecurityManager, hadoopConf: Configuration, timestamp: Long, - useCache: Boolean): File = { + useCache: Boolean, + shouldUntar: Boolean = true): File = { val fileName = decodeFileNameInURI(new URI(url)) val targetFile = new File(targetDir, fileName) val fetchCacheEnabled = conf.getBoolean("spark.files.useFetchCache", defaultValue = true) @@ -535,13 +541,23 @@ private[spark] object Utils extends Logging { doFetchFile(url, targetDir, fileName, conf, securityMgr, hadoopConf) } - // Decompress the file if it's a .tar or .tar.gz - if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) { - logInfo("Untarring " + fileName) - executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir) - } else if (fileName.endsWith(".tar")) { - logInfo("Untarring " + fileName) - executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir) + if (shouldUntar) { + // Decompress the file if it's a .tar or .tar.gz + if (fileName.endsWith(".tar.gz") || fileName.endsWith(".tgz")) { + logWarning( + "Untarring behavior will be deprecated at spark.files and " + + "SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive " + + "instead.") + logInfo("Untarring " + fileName) + executeAndGetOutput(Seq("tar", "-xzf", fileName), targetDir) + } else if (fileName.endsWith(".tar")) { + logWarning( + "Untarring behavior will be deprecated at spark.files and " + + "SparkContext.addFile. Consider using spark.archives or SparkContext.addArchive " + + "instead.") + logInfo("Untarring " + fileName) + executeAndGetOutput(Seq("tar", "-xf", fileName), targetDir) + } } // Make the file executable - That's necessary for scripts FileUtil.chmod(targetFile.getAbsolutePath, "a+x") @@ -555,6 +571,26 @@ private[spark] object Utils extends Logging { targetFile } + /** + * Unpacks an archive file into the specified directory. It expects .jar, .zip, .tar.gz, .tgz + * and .tar files. This behaves same as Hadoop's archive in distributed cache. This method is + * basically copied from `org.apache.hadoop.yarn.util.FSDownload.unpack`. + */ + def unpack(source: File, dest: File): Unit = { + val lowerSrc = StringUtils.toLowerCase(source.getName) + if (lowerSrc.endsWith(".jar")) { + RunJar.unJar(source, dest, RunJar.MATCH_ANY) + } else if (lowerSrc.endsWith(".zip")) { + FileUtil.unZip(source, dest) + } else if ( + lowerSrc.endsWith(".tar.gz") || lowerSrc.endsWith(".tgz") || lowerSrc.endsWith(".tar")) { + FileUtil.unTar(source, dest) + } else { + logWarning(s"Cannot unpack $source, just copying it to $dest.") + copyRecursive(source, dest) + } + } + /** Records the duration of running `body`. 
*/ def timeTakenMs[T](body: => T): (T, Long) = { val startTime = System.nanoTime() diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index ebdf2f59a2770..55bfa70f21fc2 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -160,6 +160,85 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } + test("SPARK-33530: basic case for addArchive and listArchives") { + withTempDir { dir => + val file1 = File.createTempFile("someprefix1", "somesuffix1", dir) + val file2 = File.createTempFile("someprefix2", "somesuffix2", dir) + val file3 = File.createTempFile("someprefix3", "somesuffix3", dir) + val file4 = File.createTempFile("someprefix4", "somesuffix4", dir) + + val jarFile = new File(dir, "test!@$jar.jar") + val zipFile = new File(dir, "test-zip.zip") + val relativePath1 = + s"${zipFile.getParent}/../${zipFile.getParentFile.getName}/${zipFile.getName}" + val relativePath2 = + s"${jarFile.getParent}/../${jarFile.getParentFile.getName}/${jarFile.getName}#zoo" + + try { + Files.write("somewords1", file1, StandardCharsets.UTF_8) + Files.write("somewords22", file2, StandardCharsets.UTF_8) + Files.write("somewords333", file3, StandardCharsets.UTF_8) + Files.write("somewords4444", file4, StandardCharsets.UTF_8) + val length1 = file1.length() + val length2 = file2.length() + val length3 = file1.length() + val length4 = file2.length() + + createJar(Seq(file1, file2), jarFile) + createJar(Seq(file3, file4), zipFile) + + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) + sc.addArchive(jarFile.getAbsolutePath) + sc.addArchive(relativePath1) + sc.addArchive(s"${jarFile.getAbsolutePath}#foo") + sc.addArchive(s"${zipFile.getAbsolutePath}#bar") + sc.addArchive(relativePath2) + + sc.parallelize(Array(1), 1).map { x => + val gotten1 = new File(SparkFiles.get(jarFile.getName)) + val gotten2 = new File(SparkFiles.get(zipFile.getName)) + val gotten3 = new File(SparkFiles.get("foo")) + val gotten4 = new File(SparkFiles.get("bar")) + val gotten5 = new File(SparkFiles.get("zoo")) + + Seq(gotten1, gotten2, gotten3, gotten4, gotten5).foreach { gotten => + if (!gotten.exists()) { + throw new SparkException(s"The archive doesn't exist: ${gotten.getAbsolutePath}") + } + if (!gotten.isDirectory) { + throw new SparkException(s"The archive was not unpacked: ${gotten.getAbsolutePath}") + } + } + + // Jars + Seq(gotten1, gotten3, gotten5).foreach { gotten => + val actualLength1 = new File(gotten, file1.getName).length() + val actualLength2 = new File(gotten, file2.getName).length() + if (actualLength1 != length1 || actualLength2 != length2) { + s"Unpacked files have different lengths $actualLength1 and $actualLength2. at " + + s"${gotten.getAbsolutePath}. They should be $length1 and $length2." + } + } + + // Zip + Seq(gotten2, gotten4).foreach { gotten => + val actualLength3 = new File(gotten, file1.getName).length() + val actualLength4 = new File(gotten, file2.getName).length() + if (actualLength3 != length3 || actualLength4 != length4) { + s"Unpacked files have different lengths $actualLength3 and $actualLength4. at " + + s"${gotten.getAbsolutePath}. They should be $length3 and $length4." 
+ } + } + x + }.count() + assert(sc.listArchives().count(_.endsWith("test!@$jar.jar")) == 1) + assert(sc.listArchives().count(_.contains("test-zip.zip")) == 2) + } finally { + sc.stop() + } + } + } + test("add and list jar files") { val jarPath = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar") try { diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index b5b3751439750..dcd35f3f6b93f 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -335,6 +335,43 @@ class SparkSubmitSuite sys.props("SPARK_SUBMIT") should be ("true") } + test("SPARK-33530: handles standalone mode with archives") { + val clArgs = Seq( + "--master", "spark://localhost:1234", + "--executor-memory", "5g", + "--executor-cores", "5", + "--class", "org.SomeClass", + "--jars", "one.jar,two.jar,three.jar", + "--driver-memory", "4g", + "--files", "file1.txt,file2.txt", + "--archives", "archive1.zip,archive2.jar", + "--num-executors", "6", + "--name", "beauty", + "--conf", "spark.ui.enabled=false", + "thejar.jar", + "arg1", "arg2") + val appArgs = new SparkSubmitArguments(clArgs) + val (childArgs, classpath, conf, mainClass) = submit.prepareSubmitEnvironment(appArgs) + val childArgsStr = childArgs.mkString(" ") + childArgsStr should include ("arg1 arg2") + mainClass should be ("org.SomeClass") + + classpath(0) should endWith ("thejar.jar") + classpath(1) should endWith ("one.jar") + classpath(2) should endWith ("two.jar") + classpath(3) should endWith ("three.jar") + + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.driver.memory") should be ("4g") + conf.get("spark.executor.cores") should be ("5") + conf.get("spark.jars") should include regex (".*one.jar,.*two.jar,.*three.jar") + conf.get("spark.files") should include regex (".*file1.txt,.*file2.txt") + conf.get("spark.archives") should include regex (".*archive1.zip,.*archive2.jar") + conf.get("spark.app.name") should be ("beauty") + conf.get(UI_ENABLED) should be (false) + sys.props("SPARK_SUBMIT") should be ("true") + } + test("handles standalone cluster mode") { testStandaloneCluster(useRest = true) } diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala index d08052faa0043..9fdbf485e17d3 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala @@ -98,6 +98,7 @@ class SubmitRestProtocolSuite extends SparkFunSuite { // optional fields conf.set(JARS, Seq("mayonnaise.jar", "ketchup.jar")) conf.set(FILES.key, "fireball.png") + conf.set(ARCHIVES.key, "fireballs.zip") conf.set("spark.driver.memory", s"${Utils.DEFAULT_DRIVER_MEM_MB}m") conf.set(DRIVER_CORES, 180) conf.set("spark.driver.extraJavaOptions", " -Dslices=5 -Dcolor=mostly_red") @@ -246,6 +247,7 @@ class SubmitRestProtocolSuite extends SparkFunSuite { | }, | "mainClass" : "org.apache.spark.examples.SparkPie", | "sparkProperties" : { + | "spark.archives" : "fireballs.zip", | "spark.driver.extraLibraryPath" : "pickle.jar", | "spark.jars" : "mayonnaise.jar,ketchup.jar", | "spark.driver.supervise" : "false", @@ -272,6 +274,7 @@ class SubmitRestProtocolSuite extends SparkFunSuite { | }, | "mainClass" : "org.apache.spark.examples.SparkPie", | 
"sparkProperties" : { + | "spark.archives" : "fireballs.zip", | "spark.driver.extraLibraryPath" : "pickle.jar", | "spark.jars" : "mayonnaise.jar,ketchup.jar", | "spark.driver.supervise" : "false", diff --git a/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala b/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala index 319dcfeecee24..810dcf0e61007 100644 --- a/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala @@ -302,7 +302,7 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite // We don't really verify the data, just pass it around. val data = ByteBuffer.wrap(Array[Byte](1, 2, 3, 4)) val taskDescription = new TaskDescription(taskId, 2, "1", "TASK 1000000", 19, - 1, mutable.Map.empty, mutable.Map.empty, new Properties, + 1, mutable.Map.empty, mutable.Map.empty, mutable.Map.empty, new Properties, Map(GPU -> new ResourceInformation(GPU, Array("0", "1"))), data) val serializedTaskDescription = TaskDescription.encode(taskDescription) backend.executor = mock[Executor] diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 5b868604ecf94..7cf7a81a76133 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -519,6 +519,7 @@ class ExecutorSuite extends SparkFunSuite partitionId = 0, addedFiles = Map[String, Long](), addedJars = Map[String, Long](), + addedArchives = Map[String, Long](), properties = new Properties, resources = immutable.Map[String, ResourceInformation](), serializedTask) diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index 65d51e57ee308..7a74dd877a042 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -244,7 +244,8 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo val taskResources = Map(GPU -> new ResourceInformation(GPU, Array("0"))) var taskDescs: Seq[Seq[TaskDescription]] = Seq(Seq(new TaskDescription(1, 0, "1", - "t1", 0, 1, mutable.Map.empty[String, Long], mutable.Map.empty[String, Long], + "t1", 0, 1, mutable.Map.empty[String, Long], + mutable.Map.empty[String, Long], mutable.Map.empty[String, Long], new Properties(), taskResources, bytebuffer))) val ts = backend.getTaskSchedulerImpl() when(ts.resourceOffers(any[IndexedSeq[WorkerOffer]], any[Boolean])).thenReturn(taskDescs) diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index 915035e9eb71c..c4a8bcbb26a1d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -91,7 +91,8 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit .set(key, secretPassword) val hadoopconf = SparkHadoopUtil.get.newConfiguration(new SparkConf()) val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf) - val envDetails = 
SparkEnv.environmentDetails(conf, hadoopconf, "FIFO", Seq.empty, Seq.empty) + val envDetails = SparkEnv.environmentDetails( + conf, hadoopconf, "FIFO", Seq.empty, Seq.empty, Seq.empty) val event = SparkListenerEnvironmentUpdate(envDetails) val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap assert(redactedProps(key) == "*********(redacted)") diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala index 5839532f11666..98b5bada27646 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala @@ -33,6 +33,10 @@ class TaskDescriptionSuite extends SparkFunSuite { originalFiles.put("fileUrl1", 1824) originalFiles.put("fileUrl2", 2) + val originalArchives = new HashMap[String, Long]() + originalArchives.put("archiveUrl1", 1824) + originalArchives.put("archiveUrl2", 2) + val originalJars = new HashMap[String, Long]() originalJars.put("jar1", 3) @@ -70,6 +74,7 @@ class TaskDescriptionSuite extends SparkFunSuite { partitionId = 1, originalFiles, originalJars, + originalArchives, originalProperties, originalResources, taskBuffer @@ -87,6 +92,7 @@ class TaskDescriptionSuite extends SparkFunSuite { assert(decodedTaskDescription.partitionId === originalTaskDescription.partitionId) assert(decodedTaskDescription.addedFiles.equals(originalFiles)) assert(decodedTaskDescription.addedJars.equals(originalJars)) + assert(decodedTaskDescription.addedArchives.equals(originalArchives)) assert(decodedTaskDescription.properties.equals(originalTaskDescription.properties)) assert(equalResources(decodedTaskDescription.resources, originalTaskDescription.resources)) assert(decodedTaskDescription.serializedTask.equals(taskBuffer)) diff --git a/docs/configuration.md b/docs/configuration.md index 76494b04c9279..d4d8e47645921 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -784,6 +784,17 @@ Apart from these, the following properties are also available, and may be useful 2.3.0 + + spark.archives + + + Comma-separated list of archives to be extracted into the working directory of each executor. + .jar, .tar.gz, .tgz and .zip are supported. You can specify the directory name to unpack via + adding # after the file name to unpack, for example, file.zip#directory. + This configuration is experimental. 
+ + 3.1.0 + spark.pyspark.driver.python diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 5a66bfca27a27..9405927eb1cb5 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -41,6 +41,7 @@ object MimaExcludes { ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.shuffle.sort.io.LocalDiskShuffleMapOutputWriter.commitAllPartitions"), ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.shuffle.api.ShuffleMapOutputWriter.commitAllPartitions"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.shuffle.api.ShuffleMapOutputWriter.commitAllPartitions"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.SparkEnv.environmentDetails"), // mllib module ProblemFilters.exclude[NewMixinForwarderProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.totalIterations"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionTrainingSummary.$init$"), diff --git a/python/docs/source/user_guide/python_packaging.rst b/python/docs/source/user_guide/python_packaging.rst index ef4d05a8eefea..0aff6dc1d16b4 100644 --- a/python/docs/source/user_guide/python_packaging.rst +++ b/python/docs/source/user_guide/python_packaging.rst @@ -77,8 +77,7 @@ Using Zipped Virtual Environment -------------------------------- The idea of zipped environments is to zip your whole `virtual environment `_, -ship it to the cluster, unzip it remotely and target the Python interpreter from inside this zipped environment. Note that this -is currently supported *only for YARN*. +ship it to the cluster, unzip it remotely and target the Python interpreter from inside this zipped environment. Zip Virtual Environment ~~~~~~~~~~~~~~~~~~~~~~~ @@ -92,16 +91,15 @@ Example with `conda-pack`: .. code-block:: bash - conda create -y -n conda_env -c conda-forge \ - pyspark==3.0.1 pyarrow==0.15.1 pandas==0.25.3 conda-pack==0.4.0 - conda activate conda_env - conda pack -f -o conda_env.tar.gz + conda create -y -n pyspark_env -c conda-forge pyarrow==2.0.0 pandas==1.1.4 conda-pack==0.5.0 + conda activate pyspark_env + conda pack -f -o pyspark_env.tar.gz Upload to Spark Executors ~~~~~~~~~~~~~~~~~~~~~~~~~ Unzipping will be done by Spark when using target ``--archives`` option in spark-submit -or setting ``spark.yarn.dist.archives`` configuration. +or setting ``spark.archives`` configuration. Example with ``spark-submit``: @@ -109,8 +107,7 @@ Example with ``spark-submit``: export PYSPARK_DRIVER_PYTHON=python export PYSPARK_PYTHON=./environment/bin/python - spark-submit --master=yarn --deploy-mode client \ - --archives conda_env.tar.gz#environment app.py + spark-submit --master=... --archives pyspark_env.tar.gz#environment app.py Example using ``SparkSession.builder``: @@ -121,11 +118,17 @@ Example using ``SparkSession.builder``: from app import main os.environ['PYSPARK_PYTHON'] = "./environment/bin/python" - builder = SparkSession.builder.master("yarn").config( - "spark.yarn.dist.archives", "conda_env.tar.gz#environment") - spark = builder.getOrCreate() + spark = SparkSession.builder.master("...").config("spark.archives", "pyspark_env.tar.gz#environment").getOrCreate() main(spark) +Example with ``pyspark`` shell: + +.. code-block:: bash + + export PYSPARK_DRIVER_PYTHON=python + export PYSPARK_PYTHON=./environment/bin/python + pyspark --master=... 
--archives pyspark_env.tar.gz#environment + Using PEX --------- diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala index 6a6514569cf90..10030a20f0884 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala @@ -264,6 +264,7 @@ class MesosFineGrainedSchedulerBackendSuite partitionId = 0, addedFiles = mutable.Map.empty[String, Long], addedJars = mutable.Map.empty[String, Long], + addedArchives = mutable.Map.empty[String, Long], properties = new Properties(), resources = immutable.Map.empty[String, ResourceInformation], ByteBuffer.wrap(new Array[Byte](0))) @@ -377,6 +378,7 @@ class MesosFineGrainedSchedulerBackendSuite partitionId = 0, addedFiles = mutable.Map.empty[String, Long], addedJars = mutable.Map.empty[String, Long], + addedArchives = mutable.Map.empty[String, Long], properties = new Properties(), resources = immutable.Map.empty[String, ResourceInformation], ByteBuffer.wrap(new Array[Byte](0))) From 52e5cc46bc184bf582f9bc9ebcc5c8180222c421 Mon Sep 17 00:00:00 2001 From: "Jungtaek Lim (HeartSaVioR)" Date: Tue, 1 Dec 2020 14:42:48 +0900 Subject: [PATCH 061/150] [SPARK-27188][SS] FileStreamSink: provide a new option to have retention on output files ### What changes were proposed in this pull request? This patch proposes to provide a new option to specify time-to-live (TTL) for output file entries in FileStreamSink. TTL is defined via current timestamp - the last modified time for the file. This patch will filter out outdated output files in metadata while compacting batches (other batches don't have functionality to clean entries), which helps metadata to not grow linearly, as well as filtered out files will be "eventually" no longer seen in reader queries which leverage File(Stream)Source. ### Why are the changes needed? The metadata log greatly helps to easily achieve exactly-once but given the output path is open to arbitrary readers, there's no way to compact the metadata log, which ends up growing the metadata file as query runs for long time, especially for compacted batch. Lots of end users have been reporting the issue: see comments in [SPARK-24295](https://issues.apache.org/jira/browse/SPARK-24295) and [SPARK-29995](https://issues.apache.org/jira/browse/SPARK-29995), and [SPARK-30462](https://issues.apache.org/jira/browse/SPARK-30462). (There're some reports from end users which include their workarounds: SPARK-24295) ### Does this PR introduce any user-facing change? No, as the configuration is new and by default it is not applied. ### How was this patch tested? New UT. Closes #28363 from HeartSaVioR/SPARK-27188-v2. 
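A minimal usage sketch, assuming a local session and a toy `rate` source just to obtain a streaming DataFrame; only the `retention` option and its duration-string format (e.g. "12h", "7d") come from this change, while the session setup, source and paths are illustrative placeholders:

```
import org.apache.spark.sql.SparkSession

// Minimal sketch, assuming a local session; only the "retention" option below is new here.
val spark = SparkSession.builder()
  .appName("file-sink-retention-sketch")
  .master("local[2]")
  .getOrCreate()

val query = spark.readStream
  .format("rate")                                  // toy source, just to get a streaming DataFrame
  .load()
  .writeStream
  .format("parquet")
  .option("path", "/tmp/stream-output")            // output directory (illustrative path)
  .option("checkpointLocation", "/tmp/stream-cp")  // illustrative checkpoint location
  // New option: file entries whose modification time is older than the TTL are eventually
  // filtered out of the sink metadata log while compacting batches.
  .option("retention", "7d")
  .start()
```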
Lead-authored-by: Jungtaek Lim (HeartSaVioR) Co-authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../structured-streaming-programming-guide.md | 6 +- .../streaming/CompactibleFileStreamLog.scala | 8 +- .../execution/streaming/FileStreamSink.scala | 7 +- .../streaming/FileStreamSinkLog.scala | 25 +++++- .../streaming/FileStreamSinkLogSuite.scala | 77 +++++++++++-------- 5 files changed, 83 insertions(+), 40 deletions(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index c671d6b590626..6995ee2475aee 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -1874,7 +1874,11 @@ Here are the details of all the sinks in Spark. File Sink Append - path: path to the output directory, must be specified. + path: path to the output directory, must be specified.
      + retention: time to live (TTL) for output files. Output files which batches were + committed older than TTL will be eventually excluded in metadata log. This means reader queries which read + the sink's output directory may not process them. You can provide the value as string format of the time. (like "12h", "7d", etc.) + By default it's disabled.
<br/>
      For file-format-specific options, see the related methods in DataFrameWriter (Scala/Java/Python/ - filterInBatch(id)(shouldRetain).getOrElse { + filterInBatch(id)(shouldRetain(_, curTime)).getOrElse { throw new IllegalStateException( s"${batchIdToPath(id)} doesn't exist " + s"(latestId: $latestId, compactInterval: $compactInterval)") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index ecaf4f8160a06..e1c9b82ec2ac9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormat, FileFormatWriter} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.{SerializableConfiguration, Utils} object FileStreamSink extends Logging { // The name of the subdirectory that is used to store metadata about which files are valid. @@ -136,8 +136,9 @@ class FileStreamSink( private val basePath = new Path(path) private val logPath = getMetadataLogPath(basePath.getFileSystem(hadoopConf), basePath, sparkSession.sessionState.conf) - private val fileLog = - new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toString) + private val retention = options.get("retention").map(Utils.timeStringAsMs) + private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, + logPath.toString, retention) private def basicWriteJobStatsTracker: BasicWriteJobStatsTracker = { val serializableHadoopConf = new SerializableConfiguration(hadoopConf) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala index 5cb68e1ae956e..2d70d95c6850d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala @@ -81,7 +81,8 @@ object SinkFileStatus { class FileStreamSinkLog( metadataLogVersion: Int, sparkSession: SparkSession, - path: String) + path: String, + _retentionMs: Option[Long] = None) extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) { private implicit val formats = Serialization.formats(NoTypeHints) @@ -96,6 +97,28 @@ class FileStreamSinkLog( require(defaultCompactInterval > 0, s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " + "to a positive value.") + + val retentionMs: Long = _retentionMs match { + case Some(retention) => + logInfo(s"Retention is set to $retention ms") + retention + + case _ => Long.MaxValue + } + + override def shouldRetain(log: SinkFileStatus, currentTime: Long): Boolean = { + if (retentionMs < Long.MaxValue) { + if (currentTime - log.modificationTime > retentionMs) { + logDebug(s"${log.path} excluded by retention - current time: $currentTime / " + + s"modification time: ${log.modificationTime} / retention: $retentionMs ms.") + false + } else { + true + } + } else { + true + } + } } object FileStreamSinkLog { diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala index 622d69e188821..d6707e7be71fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala @@ -25,7 +25,7 @@ import java.util.concurrent.ConcurrentHashMap import scala.util.Random -import org.apache.hadoop.fs.{FSDataInputStream, Path, RawLocalFileSystem} +import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path, RawLocalFileSystem} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.internal.SQLConf @@ -39,7 +39,7 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession { test("shouldRetain") { withFileStreamSinkLog { sinkLog => val log = newFakeSinkFileStatus("/a/b/x", FileStreamSinkLog.ADD_ACTION) - assert(sinkLog.shouldRetain(log)) + assert(sinkLog.shouldRetain(log, System.currentTimeMillis())) } } @@ -129,6 +129,17 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession { } } + private def listBatchFiles(fs: FileSystem, sinkLog: FileStreamSinkLog): Set[String] = { + fs.listStatus(sinkLog.metadataPath).map(_.getPath.getName).filter { fileName => + try { + getBatchIdFromFileName(fileName) + true + } catch { + case _: NumberFormatException => false + } + }.toSet + } + test("delete expired file") { // Set FILE_SINK_LOG_CLEANUP_DELAY to 0 so that we can detect the deleting behaviour // deterministically and one min batches to retain @@ -138,18 +149,7 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession { SQLConf.MIN_BATCHES_TO_RETAIN.key -> "1") { withFileStreamSinkLog { sinkLog => val fs = sinkLog.metadataPath.getFileSystem(spark.sessionState.newHadoopConf()) - - def listBatchFiles(): Set[String] = { - fs.listStatus(sinkLog.metadataPath).map(_.getPath.getName).filter { fileName => - try { - getBatchIdFromFileName(fileName) - true - } catch { - case _: NumberFormatException => false - } - }.toSet - } - + def listBatchFiles(): Set[String] = this.listBatchFiles(fs, sinkLog) sinkLog.add(0, Array(newFakeSinkFileStatus("/a/b/0", FileStreamSinkLog.ADD_ACTION))) assert(Set("0") === listBatchFiles()) sinkLog.add(1, Array(newFakeSinkFileStatus("/a/b/1", FileStreamSinkLog.ADD_ACTION))) @@ -173,18 +173,7 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession { SQLConf.MIN_BATCHES_TO_RETAIN.key -> "2") { withFileStreamSinkLog { sinkLog => val fs = sinkLog.metadataPath.getFileSystem(spark.sessionState.newHadoopConf()) - - def listBatchFiles(): Set[String] = { - fs.listStatus(sinkLog.metadataPath).map(_.getPath.getName).filter { fileName => - try { - getBatchIdFromFileName(fileName) - true - } catch { - case _: NumberFormatException => false - } - }.toSet - } - + def listBatchFiles(): Set[String] = this.listBatchFiles(fs, sinkLog) sinkLog.add(0, Array(newFakeSinkFileStatus("/a/b/0", FileStreamSinkLog.ADD_ACTION))) assert(Set("0") === listBatchFiles()) sinkLog.add(1, Array(newFakeSinkFileStatus("/a/b/1", FileStreamSinkLog.ADD_ACTION))) @@ -205,6 +194,24 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession { } } + test("filter out outdated entries when compacting") { + val curTime = System.currentTimeMillis() + withFileStreamSinkLog(sinkLog => { + val logs = Seq( + newFakeSinkFileStatus("/a/b/x", FileStreamSinkLog.ADD_ACTION, 
curTime), + newFakeSinkFileStatus("/a/b/y", FileStreamSinkLog.ADD_ACTION, curTime), + newFakeSinkFileStatus("/a/b/z", FileStreamSinkLog.ADD_ACTION, curTime)) + logs.foreach { log => assert(sinkLog.shouldRetain(log, curTime)) } + + val logs2 = Seq( + newFakeSinkFileStatus("/a/b/m", FileStreamSinkLog.ADD_ACTION, curTime - 80000), + newFakeSinkFileStatus("/a/b/n", FileStreamSinkLog.ADD_ACTION, curTime - 120000)) + logs2.foreach { log => + assert(!sinkLog.shouldRetain(log, curTime)) + } + }, Some(60000)) + } + test("read Spark 2.1.0 log format") { assert(readFromResource("file-sink-log-version-2.1.0") === Seq( SinkFileStatus("/a/b/0", 1, false, 1, 1, 100, FileStreamSinkLog.ADD_ACTION), @@ -259,23 +266,29 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSparkSession { } /** - * Create a fake SinkFileStatus using path and action. Most of tests don't care about other fields - * in SinkFileStatus. + * Create a fake SinkFileStatus using path and action, and optionally modification time. + * Most of tests don't care about other fields in SinkFileStatus. */ - private def newFakeSinkFileStatus(path: String, action: String): SinkFileStatus = { + private def newFakeSinkFileStatus( + path: String, + action: String, + modificationTime: Long = Long.MaxValue): SinkFileStatus = { SinkFileStatus( path = path, size = 100L, isDir = false, - modificationTime = 100L, + modificationTime = modificationTime, blockReplication = 1, blockSize = 100L, action = action) } - private def withFileStreamSinkLog(f: FileStreamSinkLog => Unit): Unit = { + private def withFileStreamSinkLog( + f: FileStreamSinkLog => Unit, + ttl: Option[Long] = None): Unit = { withTempDir { file => - val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, file.getCanonicalPath) + val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, file.getCanonicalPath, + ttl) f(sinkLog) } } From 103481551979297729123aaa56896d182d74847f Mon Sep 17 00:00:00 2001 From: "zky.zhoukeyong" Date: Tue, 1 Dec 2020 11:07:16 +0000 Subject: [PATCH 062/150] [SPARK-33572][SQL] Datetime building should fail if the year, month, ..., second combination is invalid ### What changes were proposed in this pull request? Datetime building should fail if the year, month, ..., second combination is invalid, when ANSI mode is enabled. This patch should update MakeDate, MakeTimestamp and MakeInterval. ### Why are the changes needed? For ANSI mode. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added UT and Existing UT. Closes #30516 from waitinfuture/SPARK-33498. 
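A minimal sketch of the behavior difference, assuming a local session; the config key `spark.sql.ansi.enabled` and the resulting `java.time.DateTimeException` match the updated `date.sql.out` expectations, while the session setup itself is illustrative:

```
import org.apache.spark.sql.SparkSession

// Minimal sketch, assuming a local session.
val spark = SparkSession.builder()
  .appName("ansi-datetime-sketch")
  .master("local[1]")
  .getOrCreate()

// Non-ANSI mode (still the default): the invalid combination silently yields NULL.
spark.conf.set("spark.sql.ansi.enabled", "false")
spark.sql("SELECT make_date(2013, 2, 30)").show()   // prints a single NULL row

// ANSI mode: the same query now fails instead of returning NULL,
// with java.time.DateTimeException: Invalid date 'FEBRUARY 30'.
spark.conf.set("spark.sql.ansi.enabled", "true")
spark.sql("SELECT make_date(2013, 2, 30)").show()
```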
Lead-authored-by: zky.zhoukeyong Co-authored-by: waitinfuture Signed-off-by: Wenchen Fan --- .../expressions/datetimeExpressions.scala | 27 ++-- .../expressions/intervalExpressions.scala | 23 +++- .../expressions/DateExpressionsSuite.scala | 118 ++++++++++++------ .../IntervalExpressionsSuite.scala | 60 +++++++++ .../sql-tests/results/postgreSQL/date.sql.out | 15 ++- 5 files changed, 187 insertions(+), 56 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 1ff5833fb4dd6..bbf1e4657f351 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1789,31 +1789,36 @@ private case class GetTimestamp( """, group = "datetime_funcs", since = "3.0.0") -case class MakeDate(year: Expression, month: Expression, day: Expression) +case class MakeDate(year: Expression, month: Expression, day: Expression, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant { + def this(year: Expression, month: Expression, day: Expression) = + this(year, month, day, SQLConf.get.ansiEnabled) + override def children: Seq[Expression] = Seq(year, month, day) override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType, IntegerType, IntegerType) override def dataType: DataType = DateType - override def nullable: Boolean = true + override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true override def nullSafeEval(year: Any, month: Any, day: Any): Any = { try { val ld = LocalDate.of(year.asInstanceOf[Int], month.asInstanceOf[Int], day.asInstanceOf[Int]) localDateToDays(ld) } catch { - case _: java.time.DateTimeException => null + case _: java.time.DateTimeException if !failOnError => null } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val failOnErrorBranch = if (failOnError) "throw e;" else s"${ev.isNull} = true;" nullSafeCodeGen(ctx, ev, (year, month, day) => { s""" try { ${ev.value} = $dtu.localDateToDays(java.time.LocalDate.of($year, $month, $day)); } catch (java.time.DateTimeException e) { - ${ev.isNull} = true; + $failOnErrorBranch }""" }) } @@ -1860,7 +1865,8 @@ case class MakeTimestamp( min: Expression, sec: Expression, timezone: Option[Expression] = None, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends SeptenaryExpression with TimeZoneAwareExpression with ImplicitCastInputTypes with NullIntolerant { @@ -1871,7 +1877,7 @@ case class MakeTimestamp( hour: Expression, min: Expression, sec: Expression) = { - this(year, month, day, hour, min, sec, None, None) + this(year, month, day, hour, min, sec, None, None, SQLConf.get.ansiEnabled) } def this( @@ -1882,7 +1888,7 @@ case class MakeTimestamp( min: Expression, sec: Expression, timezone: Expression) = { - this(year, month, day, hour, min, sec, Some(timezone), None) + this(year, month, day, hour, min, sec, Some(timezone), None, SQLConf.get.ansiEnabled) } override def children: Seq[Expression] = Seq(year, month, day, hour, min, sec) ++ timezone @@ -1892,7 +1898,7 @@ case class MakeTimestamp( Seq(IntegerType, IntegerType, IntegerType, IntegerType, IntegerType, DecimalType(8, 6)) ++ 
timezone.map(_ => StringType) override def dataType: DataType = TimestampType - override def nullable: Boolean = true + override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = copy(timeZoneId = Option(timeZoneId)) @@ -1926,7 +1932,7 @@ case class MakeTimestamp( } instantToMicros(ldt.atZone(zoneId).toInstant) } catch { - case _: DateTimeException => null + case _: DateTimeException if !failOnError => null } } @@ -1955,6 +1961,7 @@ case class MakeTimestamp( val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") val zid = ctx.addReferenceObj("zoneId", zoneId, classOf[ZoneId].getName) val d = Decimal.getClass.getName.stripSuffix("$") + val failOnErrorBranch = if (failOnError) "throw e;" else s"${ev.isNull} = true;" nullSafeCodeGen(ctx, ev, (year, month, day, hour, min, secAndNanos, timezone) => { val zoneId = timezone.map(tz => s"$dtu.getZoneId(${tz}.toString())").getOrElse(zid) s""" @@ -1978,7 +1985,7 @@ case class MakeTimestamp( java.time.Instant instant = ldt.atZone($zoneId).toInstant(); ${ev.value} = $dtu.instantToMicros(instant); } catch (java.time.DateTimeException e) { - ${ev.isNull} = true; + $failOnErrorBranch }""" }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala index 8b92c619df626..6219457bba994 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala @@ -161,9 +161,20 @@ case class MakeInterval( days: Expression, hours: Expression, mins: Expression, - secs: Expression) + secs: Expression, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends SeptenaryExpression with ImplicitCastInputTypes with NullIntolerant { + def this( + years: Expression, + months: Expression, + weeks: Expression, + days: Expression, + hours: Expression, + mins: Expression, + sec: Expression) = { + this(years, months, weeks, days, hours, mins, sec, SQLConf.get.ansiEnabled) + } def this( years: Expression, months: Expression, @@ -171,7 +182,8 @@ case class MakeInterval( days: Expression, hours: Expression, mins: Expression) = { - this(years, months, weeks, days, hours, mins, Literal(Decimal(0, Decimal.MAX_LONG_DIGITS, 6))) + this(years, months, weeks, days, hours, mins, Literal(Decimal(0, Decimal.MAX_LONG_DIGITS, 6)), + SQLConf.get.ansiEnabled) } def this( years: Expression, @@ -195,7 +207,7 @@ case class MakeInterval( override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType, IntegerType, IntegerType, IntegerType, IntegerType, IntegerType, DecimalType(Decimal.MAX_LONG_DIGITS, 6)) override def dataType: DataType = CalendarIntervalType - override def nullable: Boolean = true + override def nullable: Boolean = if (failOnError) children.exists(_.nullable) else true override def nullSafeEval( year: Any, @@ -215,7 +227,7 @@ case class MakeInterval( min.asInstanceOf[Int], sec.map(_.asInstanceOf[Decimal]).getOrElse(Decimal(0, Decimal.MAX_LONG_DIGITS, 6))) } catch { - case _: ArithmeticException => null + case _: ArithmeticException if !failOnError => null } } @@ -223,11 +235,12 @@ case class MakeInterval( nullSafeCodeGen(ctx, ev, (year, month, week, day, hour, min, sec) => { val iu = IntervalUtils.getClass.getName.stripSuffix("$") val secFrac = sec.getOrElse("0") + val faileOnErrorBranch = 
if (failOnError) "throw e;" else s"${ev.isNull} = true;" s""" try { ${ev.value} = $iu.makeInterval($year, $month, $week, $day, $hour, $min, $secFrac); } catch (java.lang.ArithmeticException e) { - ${ev.isNull} = true; + $faileOnErrorBranch } """ }) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index a3ffc1129fd5e..587ca0cdbed6e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.{Date, Timestamp} import java.text.{ParseException, SimpleDateFormat} -import java.time.{Instant, LocalDate, ZoneId} +import java.time.{DateTimeException, Instant, LocalDate, ZoneId} import java.time.format.DateTimeParseException import java.util.{Calendar, Locale, TimeZone} import java.util.concurrent.TimeUnit._ @@ -1014,49 +1014,97 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("creating values of DateType via make_date") { - checkEvaluation(MakeDate(Literal(2013), Literal(7), Literal(15)), Date.valueOf("2013-7-15")) - checkEvaluation(MakeDate(Literal.create(null, IntegerType), Literal(7), Literal(15)), null) - checkEvaluation(MakeDate(Literal(2019), Literal.create(null, IntegerType), Literal(19)), null) - checkEvaluation(MakeDate(Literal(2019), Literal(7), Literal.create(null, IntegerType)), null) - checkEvaluation(MakeDate(Literal(Int.MaxValue), Literal(13), Literal(19)), null) - checkEvaluation(MakeDate(Literal(2019), Literal(13), Literal(19)), null) - checkEvaluation(MakeDate(Literal(2019), Literal(7), Literal(32)), null) + Seq(true, false).foreach({ ansi => + withSQLConf(SQLConf.ANSI_ENABLED.key -> ansi.toString) { + checkEvaluation(MakeDate(Literal(2013), Literal(7), Literal(15)), Date.valueOf("2013-7-15")) + checkEvaluation(MakeDate(Literal.create(null, IntegerType), Literal(7), Literal(15)), null) + checkEvaluation(MakeDate(Literal(2019), Literal.create(null, IntegerType), Literal(19)), + null) + checkEvaluation(MakeDate(Literal(2019), Literal(7), Literal.create(null, IntegerType)), + null) + } + }) + + // ansi test + withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + checkExceptionInExpression[DateTimeException](MakeDate(Literal(Int.MaxValue), Literal(13), + Literal(19)), EmptyRow, "Invalid value for Year") + checkExceptionInExpression[DateTimeException](MakeDate(Literal(2019), + Literal(13), Literal(19)), EmptyRow, "Invalid value for Month") + checkExceptionInExpression[DateTimeException](MakeDate(Literal(2019), Literal(7), + Literal(32)), EmptyRow, "Invalid value for Day") + } + + // non-ansi test + withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") { + checkEvaluation(MakeDate(Literal(Int.MaxValue), Literal(13), Literal(19)), null) + checkEvaluation(MakeDate(Literal(2019), Literal(13), Literal(19)), null) + checkEvaluation(MakeDate(Literal(2019), Literal(7), Literal(32)), null) + } } test("creating values of TimestampType via make_timestamp") { - var makeTimestampExpr = MakeTimestamp( - Literal(2013), Literal(7), Literal(15), Literal(8), Literal(15), - Literal(Decimal(BigDecimal(23.5), 8, 6)), Some(Literal(ZoneId.systemDefault().getId))) val expected = Timestamp.valueOf("2013-7-15 8:15:23.5") - checkEvaluation(makeTimestampExpr, expected) - 
checkEvaluation(makeTimestampExpr.copy(timezone = None), expected) - - checkEvaluation(makeTimestampExpr.copy(year = Literal.create(null, IntegerType)), null) - checkEvaluation(makeTimestampExpr.copy(year = Literal(Int.MaxValue)), null) - - checkEvaluation(makeTimestampExpr.copy(month = Literal.create(null, IntegerType)), null) - checkEvaluation(makeTimestampExpr.copy(month = Literal(13)), null) - - checkEvaluation(makeTimestampExpr.copy(day = Literal.create(null, IntegerType)), null) - checkEvaluation(makeTimestampExpr.copy(day = Literal(32)), null) - checkEvaluation(makeTimestampExpr.copy(hour = Literal.create(null, IntegerType)), null) - checkEvaluation(makeTimestampExpr.copy(hour = Literal(25)), null) + Seq(true, false).foreach { ansi => + withSQLConf(SQLConf.ANSI_ENABLED.key -> ansi.toString) { + var makeTimestampExpr = MakeTimestamp( + Literal(2013), Literal(7), Literal(15), Literal(8), Literal(15), + Literal(Decimal(BigDecimal(23.5), 8, 6)), Some(Literal(ZoneId.systemDefault().getId))) + checkEvaluation(makeTimestampExpr, expected) + checkEvaluation(makeTimestampExpr.copy(year = Literal.create(null, IntegerType)), null) + checkEvaluation(makeTimestampExpr.copy(month = Literal.create(null, IntegerType)), null) + checkEvaluation(makeTimestampExpr.copy(day = Literal.create(null, IntegerType)), null) + checkEvaluation(makeTimestampExpr.copy(hour = Literal.create(null, IntegerType)), null) + checkEvaluation(makeTimestampExpr.copy(min = Literal.create(null, IntegerType)), null) + checkEvaluation(makeTimestampExpr.copy(sec = Literal.create(null, DecimalType(8, 6))), null) + checkEvaluation(makeTimestampExpr.copy(timezone = None), expected) + + Seq( + (makeTimestampExpr.copy(year = Literal(Int.MaxValue)), "Invalid value for Year"), + (makeTimestampExpr.copy(month = Literal(13)), "Invalid value for Month"), + (makeTimestampExpr.copy(day = Literal(32)), "Invalid value for Day"), + (makeTimestampExpr.copy(hour = Literal(25)), "Invalid value for Hour"), + (makeTimestampExpr.copy(min = Literal(65)), "Invalid value for Min"), + (makeTimestampExpr.copy(sec = Literal(Decimal( + BigDecimal(70.0), 8, 6))), "Invalid value for Second") + ).foreach { entry => + if (ansi) { + checkExceptionInExpression[DateTimeException](entry._1, EmptyRow, entry._2) + } else { + checkEvaluation(entry._1, null) + } + } - checkEvaluation(makeTimestampExpr.copy(min = Literal.create(null, IntegerType)), null) - checkEvaluation(makeTimestampExpr.copy(min = Literal(65)), null) + makeTimestampExpr = MakeTimestamp(Literal(2019), Literal(6), Literal(30), + Literal(23), Literal(59), Literal(Decimal(BigDecimal(60.0), 8, 6))) + if (ansi) { + checkExceptionInExpression[DateTimeException](makeTimestampExpr.copy(sec = Literal( + Decimal(BigDecimal(60.5), 8, 6))), EmptyRow, "The fraction of sec must be zero") + } else { + checkEvaluation(makeTimestampExpr, Timestamp.valueOf("2019-07-01 00:00:00")) + } - checkEvaluation(makeTimestampExpr.copy(sec = Literal.create(null, DecimalType(8, 6))), null) - checkEvaluation(makeTimestampExpr.copy(sec = Literal(Decimal(BigDecimal(70.0), 8, 6))), null) + makeTimestampExpr = MakeTimestamp(Literal(2019), Literal(8), Literal(12), Literal(0), + Literal(0), Literal(Decimal(BigDecimal(58.000001), 8, 6))) + checkEvaluation(makeTimestampExpr, Timestamp.valueOf("2019-08-12 00:00:58.000001")) + } + } - makeTimestampExpr = MakeTimestamp(Literal(2019), Literal(6), Literal(30), - Literal(23), Literal(59), Literal(Decimal(BigDecimal(60.0), 8, 6))) - checkEvaluation(makeTimestampExpr, Timestamp.valueOf("2019-07-01 
00:00:00")) - checkEvaluation(makeTimestampExpr.copy(sec = Literal(Decimal(BigDecimal(60.5), 8, 6))), null) + // non-ansi test + withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") { + val makeTimestampExpr = MakeTimestamp(Literal(2019), Literal(6), Literal(30), + Literal(23), Literal(59), Literal(Decimal(BigDecimal(60.0), 8, 6))) + checkEvaluation(makeTimestampExpr.copy(sec = Literal(Decimal(BigDecimal(60.5), 8, 6))), null) + } - makeTimestampExpr = MakeTimestamp(Literal(2019), Literal(8), Literal(12), - Literal(0), Literal(0), Literal(Decimal(BigDecimal(58.000001), 8, 6))) - checkEvaluation(makeTimestampExpr, Timestamp.valueOf("2019-08-12 00:00:58.000001")) + Seq(true, false).foreach { ansi => + withSQLConf(SQLConf.ANSI_ENABLED.key -> ansi.toString) { + val makeTimestampExpr = MakeTimestamp(Literal(2019), Literal(8), Literal(12), + Literal(0), Literal(0), Literal(Decimal(BigDecimal(58.000001), 8, 6))) + checkEvaluation(makeTimestampExpr, Timestamp.valueOf("2019-08-12 00:00:58.000001")) + } + } } test("ISO 8601 week-numbering year") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala index 6b7be4f1609a5..5c73a91de4f79 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala @@ -214,4 +214,64 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { millis = Int.MaxValue, micros = Int.MaxValue) } + + test("ANSI mode: make interval") { + def check( + years: Int = 0, + months: Int = 0, + weeks: Int = 0, + days: Int = 0, + hours: Int = 0, + minutes: Int = 0, + seconds: Int = 0, + millis: Int = 0, + micros: Int = 0): Unit = { + val secFrac = DateTimeTestUtils.secFrac(seconds, millis, micros) + val intervalExpr = MakeInterval(Literal(years), Literal(months), Literal(weeks), + Literal(days), Literal(hours), Literal(minutes), + Literal(Decimal(secFrac, Decimal.MAX_LONG_DIGITS, 6))) + val totalMonths = years * MONTHS_PER_YEAR + months + val totalDays = weeks * DAYS_PER_WEEK + days + val totalMicros = secFrac + minutes * MICROS_PER_MINUTE + hours * MICROS_PER_HOUR + val expected = new CalendarInterval(totalMonths, totalDays, totalMicros) + checkEvaluation(intervalExpr, expected) + } + + def checkException( + years: Int = 0, + months: Int = 0, + weeks: Int = 0, + days: Int = 0, + hours: Int = 0, + minutes: Int = 0, + seconds: Int = 0, + millis: Int = 0, + micros: Int = 0): Unit = { + val secFrac = DateTimeTestUtils.secFrac(seconds, millis, micros) + val intervalExpr = MakeInterval(Literal(years), Literal(months), Literal(weeks), + Literal(days), Literal(hours), Literal(minutes), + Literal(Decimal(secFrac, Decimal.MAX_LONG_DIGITS, 6))) + checkExceptionInExpression[ArithmeticException](intervalExpr, EmptyRow, "") + } + + withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + check(months = 0, days = 0, micros = 0) + check(years = -123) + check(weeks = 123) + check(millis = -123) + check(9999, 11, 0, 31, 23, 59, 59, 999, 999) + check(years = 10000, micros = -1) + check(-9999, -11, 0, -31, -23, -59, -59, -999, -999) + check(years = -10000, micros = 1) + check( + hours = Int.MaxValue, + minutes = Int.MaxValue, + seconds = Int.MaxValue, + millis = Int.MaxValue, + micros = Int.MaxValue) + + checkException(years = Int.MaxValue) + checkException(weeks = 
Int.MaxValue) + } + } } diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out index 151fa1e28d725..a959284750483 100755 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out @@ -590,25 +590,28 @@ struct -- !query select make_date(2013, 2, 30) -- !query schema -struct +struct<> -- !query output -NULL +java.time.DateTimeException +Invalid date 'FEBRUARY 30' -- !query select make_date(2013, 13, 1) -- !query schema -struct +struct<> -- !query output -NULL +java.time.DateTimeException +Invalid value for MonthOfYear (valid values 1 - 12): 13 -- !query select make_date(2013, 11, -1) -- !query schema -struct +struct<> -- !query output -NULL +java.time.DateTimeException +Invalid value for DayOfMonth (valid values 1 - 28/31): -1 -- !query From e5bb2937f6682239e83605b65214dfca3bdd50e5 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Tue, 1 Dec 2020 20:34:00 +0900 Subject: [PATCH 063/150] [SPARK-32032][SS] Avoid infinite wait in driver because of KafkaConsumer.poll(long) API ### What changes were proposed in this pull request? Deprecated `KafkaConsumer.poll(long)` API calls may cause infinite wait in the driver. In this PR I've added a new `AdminClient` based offset fetching which is turned off by default. There is a new flag named `spark.sql.streaming.kafka.useDeprecatedOffsetFetching` (default: `true`) which can be set to `false` to reach the newly added functionality. The Structured Streaming migration guide contains more information what migration consideration must be done. Please see the following [doc](https://docs.google.com/document/d/1gAh0pKgZUgyqO2Re3sAy-fdYpe_SxpJ6DkeXE8R1P7E/edit?usp=sharing) for further details. The PR contains the following changes: * Added `AdminClient` based offset fetching * GroupId prefix feature removed from driver but only in `AdminClient` based approach (`AdminClient` doesn't need any GroupId) * GroupId override feature removed from driver but only in `AdminClient` based approach (`AdminClient` doesn't need any GroupId) * Additional unit tests * Code comment changes * Minor bugfixes here and there * Removed Kafka auto topic creation feature but only in `AdminClient` based approach (please see doc for rationale). In short, it's super hidden, not sure anybody ever used in production + error prone. * Added documentation to `ss-migration-guide` and `structured-streaming-kafka-integration` ### Why are the changes needed? Driver may hang forever. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing + additional unit tests. Cluster test with simple Kafka topic to another topic query. Documentation: ``` cd docs/ SKIP_API=1 jekyll build ``` Manual webpage check. Closes #29729 from gaborgsomogyi/SPARK-32032. 
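For illustration only (not part of this patch), a minimal sketch of how a query could opt into the new `AdminClient` based offset fetching by turning the flag off; the bootstrap servers, topic name, and checkpoint path below are placeholders:

```
// Hypothetical example; "localhost:9092", "events" and the checkpoint path are placeholder values.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("admin-based-offset-fetching")
  // Switch the driver away from the deprecated KafkaConsumer.poll(long) path to AdminClient.
  .config("spark.sql.streaming.kafka.useDeprecatedOffsetFetching", "false")
  .getOrCreate()

val stream = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "events")
  .load()

stream.writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/checkpoints/events")  // placeholder path
  .start()
  .awaitTermination()
```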
Authored-by: Gabor Somogyi Signed-off-by: Jungtaek Lim (HeartSaVioR) --- docs/ss-migration-guide.md | 5 + .../structured-streaming-kafka-integration.md | 20 + .../spark/sql/kafka010/ConsumerStrategy.scala | 65 +- .../spark/sql/kafka010/KafkaBatch.scala | 2 +- .../sql/kafka010/KafkaOffsetReader.scala | 601 +---------------- .../sql/kafka010/KafkaOffsetReaderAdmin.scala | 573 ++++++++++++++++ .../kafka010/KafkaOffsetReaderConsumer.scala | 614 ++++++++++++++++++ .../spark/sql/kafka010/KafkaRelation.scala | 2 +- .../sql/kafka010/KafkaSourceProvider.scala | 6 +- .../sql/kafka010/ConsumerStrategySuite.scala | 147 +++++ .../kafka010/KafkaMicroBatchSourceSuite.scala | 42 +- .../sql/kafka010/KafkaOffsetReaderSuite.scala | 95 ++- .../sql/kafka010/KafkaRelationSuite.scala | 47 +- .../apache/spark/sql/internal/SQLConf.scala | 13 + 14 files changed, 1587 insertions(+), 645 deletions(-) create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala diff --git a/docs/ss-migration-guide.md b/docs/ss-migration-guide.md index d52b2e095fc76..480e5e2695a16 100644 --- a/docs/ss-migration-guide.md +++ b/docs/ss-migration-guide.md @@ -30,6 +30,11 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide. - In Spark 3.0 and before, for the queries that have stateful operation which can emit rows older than the current watermark plus allowed late record delay, which are "late rows" in downstream stateful operations and these rows can be discarded, Spark only prints a warning message. Since Spark 3.1, Spark will check for such queries with possible correctness issue and throw AnalysisException for it by default. For the users who understand the possible risk of correctness issue and still decide to run the query, please disable this check by setting the config `spark.sql.streaming.statefulOperator.checkCorrectness.enabled` to false. +- In Spark 3.0 and before Spark uses `KafkaConsumer` for offset fetching which could cause infinite wait in the driver. + In Spark 3.1 a new configuration option added `spark.sql.streaming.kafka.useDeprecatedOffsetFetching` (default: `true`) + which could be set to `false` allowing Spark to use new offset fetching mechanism using `AdminClient`. + For further details please see [Structured Streaming Kafka Integration](structured-streaming-kafka-integration.html#offset-fetching). + ## Upgrading from Structured Streaming 2.4 to 3.0 - In Spark 3.0, Structured Streaming forces the source schema into nullable when file-based datasources such as text, json, csv, parquet and orc are used via `spark.readStream(...)`. Previously, it respected the nullability in source schema; however, it caused issues tricky to debug with NPE. To restore the previous behavior, set `spark.sql.streaming.fileSource.schema.forceNullable` to `false`. diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index 0e4d167b58d6b..f92dd039d53b7 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -512,6 +512,26 @@ The following configurations are optional: +### Offset fetching + +In Spark 3.0 and before Spark uses KafkaConsumer for offset fetching which could cause infinite wait in the driver. 
+In Spark 3.1 a new configuration option, spark.sql.streaming.kafka.useDeprecatedOffsetFetching (default: true), was added;
+it can be set to `false` to make Spark use the new offset fetching mechanism based on AdminClient.
+When the new mechanism is used, the following applies.
+
+First of all, the new approach supports Kafka brokers `0.11.0.0+`.
+
+In Spark 3.0 and below, secure Kafka processing needed the following ACLs from the driver perspective:
+* Topic resource describe operation
+* Topic resource read operation
+* Group resource read operation
+
+Since Spark 3.1, offsets can be obtained with AdminClient instead of KafkaConsumer, and only the following ACL is needed from the driver perspective:
+* Topic resource describe operation
+
+Since the AdminClient in the driver does not connect to a consumer group, group.id based authorization no longer works there (executors have never done group based authorization).
+It is worth mentioning that the executor side behaves exactly as before (group prefix and override still work).
+
 ### Consumer Caching
 
 It's time-consuming to initialize Kafka consumers, especially in streaming scenarios where processing time is a key factor.
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala
index 7bb829c282eba..a0331d7889e04 100644
--- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala
+++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala
@@ -20,12 +20,15 @@ package org.apache.spark.sql.kafka010
 import java.{util => ju}
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 
+import org.apache.kafka.clients.admin.Admin
 import org.apache.kafka.clients.consumer.{Consumer, KafkaConsumer}
 import org.apache.kafka.clients.consumer.internals.NoOpConsumerRebalanceListener
 import org.apache.kafka.common.TopicPartition
 
-import org.apache.spark.kafka010.KafkaConfigUpdater
+import org.apache.spark.internal.Logging
+import org.apache.spark.kafka010.{KafkaConfigUpdater, KafkaRedactionUtil}
 
 /**
  * Subscribe allows you to subscribe to a fixed collection of topics.
@@ -36,10 +39,20 @@ import org.apache.spark.kafka010.KafkaConfigUpdater
  * All three strategies have overloaded constructors that allow you to specify
  * the starting offset for a particular partition.
  */
-private[kafka010] sealed trait ConsumerStrategy {
+private[kafka010] sealed trait ConsumerStrategy extends Logging {
   /** Create a [[KafkaConsumer]] and subscribe to topics according to a desired strategy */
   def createConsumer(kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]]
 
+  /** Creates an [[org.apache.kafka.clients.admin.AdminClient]] */
+  def createAdmin(kafkaParams: ju.Map[String, Object]): Admin = {
+    val updatedKafkaParams = setAuthenticationConfigIfNeeded(kafkaParams)
+    logDebug(s"Admin params: ${KafkaRedactionUtil.redactParams(updatedKafkaParams.asScala.toSeq)}")
+    Admin.create(updatedKafkaParams)
+  }
+
+  /** Returns the assigned or subscribed [[TopicPartition]] */
+  def assignedTopicPartitions(admin: Admin): Set[TopicPartition]
+
   /**
    * Updates the parameters with security if needed.
    * Added a function to hide internals and reduce code duplications because all strategy uses it. 
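As an illustrative aside (not part of the diff), the two new trait members above are meant to be used together on the driver roughly as sketched below; this mirrors how the new Admin based offset reader calls them, with `strategy` and `kafkaParams` left as assumed inputs:

```
// Rough usage sketch, not from the patch; `strategy` and `kafkaParams` are assumed inputs.
import java.{util => ju}

import org.apache.kafka.clients.admin.Admin
import org.apache.kafka.common.TopicPartition

def discoverPartitions(
    strategy: ConsumerStrategy,
    kafkaParams: ju.Map[String, Object]): Set[TopicPartition] = {
  // createAdmin applies the security settings and redacts them before logging.
  val admin: Admin = strategy.createAdmin(kafkaParams)
  try {
    // Assign/Subscribe/SubscribePattern each resolve their own set of TopicPartitions;
    // no consumer group is joined, so no group.id is required on the driver.
    strategy.assignedTopicPartitions(admin)
  } finally {
    admin.close()
  }
}
```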
@@ -48,13 +61,24 @@ private[kafka010] sealed trait ConsumerStrategy { KafkaConfigUpdater("source", kafkaParams.asScala.toMap) .setAuthenticationConfigIfNeeded() .build() + + protected def retrieveAllPartitions(admin: Admin, topics: Set[String]): Set[TopicPartition] = { + admin.describeTopics(topics.asJava).all().get().asScala.filterNot(_._2.isInternal).flatMap { + case (topic, topicDescription) => + topicDescription.partitions().asScala.map { topicPartitionInfo => + val partition = topicPartitionInfo.partition() + logDebug(s"Partition found: $topic:$partition") + new TopicPartition(topic, partition) + } + }.toSet + } } /** * Specify a fixed collection of partitions. */ private[kafka010] case class AssignStrategy(partitions: Array[TopicPartition]) - extends ConsumerStrategy { + extends ConsumerStrategy with Logging { override def createConsumer( kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]] = { val updatedKafkaParams = setAuthenticationConfigIfNeeded(kafkaParams) @@ -63,13 +87,20 @@ private[kafka010] case class AssignStrategy(partitions: Array[TopicPartition]) consumer } + override def assignedTopicPartitions(admin: Admin): Set[TopicPartition] = { + val topics = partitions.map(_.topic()).toSet + logDebug(s"Topics for assignment: $topics") + retrieveAllPartitions(admin, topics).filter(partitions.contains(_)) + } + override def toString: String = s"Assign[${partitions.mkString(", ")}]" } /** * Subscribe to a fixed collection of topics. */ -private[kafka010] case class SubscribeStrategy(topics: Seq[String]) extends ConsumerStrategy { +private[kafka010] case class SubscribeStrategy(topics: Seq[String]) + extends ConsumerStrategy with Logging { override def createConsumer( kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]] = { val updatedKafkaParams = setAuthenticationConfigIfNeeded(kafkaParams) @@ -78,6 +109,10 @@ private[kafka010] case class SubscribeStrategy(topics: Seq[String]) extends Cons consumer } + override def assignedTopicPartitions(admin: Admin): Set[TopicPartition] = { + retrieveAllPartitions(admin, topics.toSet) + } + override def toString: String = s"Subscribe[${topics.mkString(", ")}]" } @@ -85,16 +120,30 @@ private[kafka010] case class SubscribeStrategy(topics: Seq[String]) extends Cons * Use a regex to specify topics of interest. 
*/ private[kafka010] case class SubscribePatternStrategy(topicPattern: String) - extends ConsumerStrategy { + extends ConsumerStrategy with Logging { + private val topicRegex = topicPattern.r + override def createConsumer( kafkaParams: ju.Map[String, Object]): Consumer[Array[Byte], Array[Byte]] = { val updatedKafkaParams = setAuthenticationConfigIfNeeded(kafkaParams) val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](updatedKafkaParams) - consumer.subscribe( - ju.regex.Pattern.compile(topicPattern), - new NoOpConsumerRebalanceListener()) + consumer.subscribe(ju.regex.Pattern.compile(topicPattern), new NoOpConsumerRebalanceListener()) consumer } + override def assignedTopicPartitions(admin: Admin): Set[TopicPartition] = { + logDebug(s"Topic pattern: $topicPattern") + var topics = mutable.Seq.empty[String] + // listTopics is not listing internal topics by default so no filter needed + admin.listTopics().listings().get().asScala.foreach { topicListing => + val name = topicListing.name() + if (topicRegex.findFirstIn(name).isDefined) { + logDebug(s"Topic matches pattern: $name") + topics :+= name + } + } + retrieveAllPartitions(admin, topics.toSet) + } + override def toString: String = s"SubscribePattern[$topicPattern]" } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala index a1b0f7d22216b..268719d6aed2c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala @@ -48,7 +48,7 @@ private[kafka010] class KafkaBatch( // id. Hence, we should generate a unique id for each query. val uniqueGroupId = KafkaSourceProvider.batchUniqueGroupId(sourceOptions) - val kafkaOffsetReader = new KafkaOffsetReader( + val kafkaOffsetReader = KafkaOffsetReader.build( strategy, KafkaSourceProvider.kafkaParamsForDriver(specifiedKafkaParams), sourceOptions, diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index adcc20c25cb5f..b1992c1dc6a0a 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -19,595 +19,62 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer -import scala.util.control.NonFatal - -import org.apache.kafka.clients.consumer.{Consumer, ConsumerConfig, OffsetAndTimestamp} import org.apache.kafka.common.TopicPartition -import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging -import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.util.{UninterruptibleThread, UninterruptibleThreadRunner} +import org.apache.spark.sql.internal.SQLConf /** - * This class uses Kafka's own [[org.apache.kafka.clients.consumer.KafkaConsumer]] API to - * read data offsets from Kafka. - * The [[ConsumerStrategy]] class defines which Kafka topics and partitions should be read - * by this source. These strategies directly correspond to the different consumption options - * in. 
This class is designed to return a configured - * [[org.apache.kafka.clients.consumer.KafkaConsumer]] that is used by the - * [[KafkaSource]] to query for the offsets. See the docs on - * [[org.apache.spark.sql.kafka010.ConsumerStrategy]] - * for more details. - * - * Note: This class is not ThreadSafe + * Base trait to fetch offsets from Kafka. The implementations are + * [[KafkaOffsetReaderConsumer]] and [[KafkaOffsetReaderAdmin]]. + * Please see the documentation and API description there. */ -private[kafka010] class KafkaOffsetReader( - consumerStrategy: ConsumerStrategy, - val driverKafkaParams: ju.Map[String, Object], - readerOptions: CaseInsensitiveMap[String], - driverGroupIdPrefix: String) extends Logging { - - /** - * [[UninterruptibleThreadRunner]] ensures that all - * [[org.apache.kafka.clients.consumer.KafkaConsumer]] communication called in an - * [[UninterruptibleThread]]. In the case of streaming queries, we are already running in an - * [[UninterruptibleThread]], however for batch mode this is not the case. - */ - val uninterruptibleThreadRunner = new UninterruptibleThreadRunner("Kafka Offset Reader") - - /** - * Place [[groupId]] and [[nextId]] here so that they are initialized before any consumer is - * created -- see SPARK-19564. - */ - private var groupId: String = null - private var nextId = 0 +private[kafka010] trait KafkaOffsetReader { - /** - * A KafkaConsumer used in the driver to query the latest Kafka offsets. This only queries the - * offsets and never commits them. - */ - @volatile protected var _consumer: Consumer[Array[Byte], Array[Byte]] = null - - protected def consumer: Consumer[Array[Byte], Array[Byte]] = synchronized { - assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) - if (_consumer == null) { - val newKafkaParams = new ju.HashMap[String, Object](driverKafkaParams) - if (driverKafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG) == null) { - newKafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, nextGroupId()) - } - _consumer = consumerStrategy.createConsumer(newKafkaParams) - } - _consumer - } + // These are needed here because of KafkaSourceProviderSuite + private[kafka010] val maxOffsetFetchAttempts: Int + private[kafka010] val offsetFetchAttemptIntervalMs: Long - private[kafka010] val maxOffsetFetchAttempts = - readerOptions.getOrElse(KafkaSourceProvider.FETCH_OFFSET_NUM_RETRY, "3").toInt + // This is needed here because of KafkaContinuousStream + val driverKafkaParams: ju.Map[String, Object] - /** - * Number of partitions to read from Kafka. If this value is greater than the number of Kafka - * topicPartitions, we will split up the read tasks of the skewed partitions to multiple Spark - * tasks. The number of Spark tasks will be *approximately* `numPartitions`. It can be less or - * more depending on rounding errors or Kafka partitions that didn't receive any new data. - */ - private val minPartitions = - readerOptions.get(KafkaSourceProvider.MIN_PARTITIONS_OPTION_KEY).map(_.toInt) - - private val rangeCalculator = new KafkaOffsetRangeCalculator(minPartitions) - - private[kafka010] val offsetFetchAttemptIntervalMs = - readerOptions.getOrElse(KafkaSourceProvider.FETCH_OFFSET_RETRY_INTERVAL_MS, "1000").toLong - - /** - * Whether we should divide Kafka TopicPartitions with a lot of data into smaller Spark tasks. 
- */ - private def shouldDivvyUpLargePartitions(numTopicPartitions: Int): Boolean = { - minPartitions.map(_ > numTopicPartitions).getOrElse(false) - } - - private def nextGroupId(): String = { - groupId = driverGroupIdPrefix + "-" + nextId - nextId += 1 - groupId - } - - override def toString(): String = consumerStrategy.toString - - /** - * Closes the connection to Kafka, and cleans up state. - */ - def close(): Unit = { - if (_consumer != null) uninterruptibleThreadRunner.runUninterruptibly { stopConsumer() } - uninterruptibleThreadRunner.shutdown() - } - - /** - * @return The Set of TopicPartitions for a given topic - */ - def fetchTopicPartitions(): Set[TopicPartition] = uninterruptibleThreadRunner.runUninterruptibly { - assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) - // Poll to get the latest assigned partitions - consumer.poll(0) - val partitions = consumer.assignment() - consumer.pause(partitions) - partitions.asScala.toSet - } - - /** - * Fetch the partition offsets for the topic partitions that are indicated - * in the [[ConsumerStrategy]] and [[KafkaOffsetRangeLimit]]. - */ + def close(): Unit def fetchPartitionOffsets( offsetRangeLimit: KafkaOffsetRangeLimit, - isStartingOffsets: Boolean): Map[TopicPartition, Long] = { - def validateTopicPartitions(partitions: Set[TopicPartition], - partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { - assert(partitions == partitionOffsets.keySet, - "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + - "Use -1 for latest, -2 for earliest.\n" + - s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions}") - logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionOffsets") - partitionOffsets - } - val partitions = fetchTopicPartitions() - // Obtain TopicPartition offsets with late binding support - offsetRangeLimit match { - case EarliestOffsetRangeLimit => partitions.map { - case tp => tp -> KafkaOffsetRangeLimit.EARLIEST - }.toMap - case LatestOffsetRangeLimit => partitions.map { - case tp => tp -> KafkaOffsetRangeLimit.LATEST - }.toMap - case SpecificOffsetRangeLimit(partitionOffsets) => - validateTopicPartitions(partitions, partitionOffsets) - case SpecificTimestampRangeLimit(partitionTimestamps) => - fetchSpecificTimestampBasedOffsets(partitionTimestamps, - failsOnNoMatchingOffset = isStartingOffsets).partitionToOffsets - } - } - - /** - * Resolves the specific offsets based on Kafka seek positions. - * This method resolves offset value -1 to the latest and -2 to the - * earliest Kafka seek position. - * - * @param partitionOffsets the specific offsets to resolve - * @param reportDataLoss callback to either report or log data loss depending on setting - */ + isStartingOffsets: Boolean): Map[TopicPartition, Long] def fetchSpecificOffsets( partitionOffsets: Map[TopicPartition, Long], - reportDataLoss: String => Unit): KafkaSourceOffset = { - val fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit = { partitions => - assert(partitions.asScala == partitionOffsets.keySet, - "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + - "Use -1 for latest, -2 for earliest, if you don't care.\n" + - s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions.asScala}") - logDebug(s"Partitions assigned to consumer: $partitions. 
Seeking to $partitionOffsets") - } - - val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] = { _ => - partitionOffsets - } - - val fnAssertFetchedOffsets: Map[TopicPartition, Long] => Unit = { fetched => - partitionOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (fetched(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - } - - fetchSpecificOffsets0(fnAssertParametersWithPartitions, fnRetrievePartitionOffsets, - fnAssertFetchedOffsets) - } - + reportDataLoss: String => Unit): KafkaSourceOffset def fetchSpecificTimestampBasedOffsets( partitionTimestamps: Map[TopicPartition, Long], - failsOnNoMatchingOffset: Boolean): KafkaSourceOffset = { - val fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit = { partitions => - assert(partitions.asScala == partitionTimestamps.keySet, - "If starting/endingOffsetsByTimestamp contains specific offsets, you must specify all " + - s"topics. Specified: ${partitionTimestamps.keySet} Assigned: ${partitions.asScala}") - logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionTimestamps") - } - - val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] = { _ => { - val converted = partitionTimestamps.map { case (tp, timestamp) => - tp -> java.lang.Long.valueOf(timestamp) - }.asJava - - val offsetForTime: ju.Map[TopicPartition, OffsetAndTimestamp] = - consumer.offsetsForTimes(converted) - - offsetForTime.asScala.map { case (tp, offsetAndTimestamp) => - if (failsOnNoMatchingOffset) { - assert(offsetAndTimestamp != null, "No offset matched from request of " + - s"topic-partition $tp and timestamp ${partitionTimestamps(tp)}.") - } - - if (offsetAndTimestamp == null) { - tp -> KafkaOffsetRangeLimit.LATEST - } else { - tp -> offsetAndTimestamp.offset() - } - }.toMap - } - } - - val fnAssertFetchedOffsets: Map[TopicPartition, Long] => Unit = { _ => } - - fetchSpecificOffsets0(fnAssertParametersWithPartitions, fnRetrievePartitionOffsets, - fnAssertFetchedOffsets) - } - - private def fetchSpecificOffsets0( - fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit, - fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long], - fnAssertFetchedOffsets: Map[TopicPartition, Long] => Unit): KafkaSourceOffset = { - val fetched = partitionsAssignedToConsumer { - partitions => { - fnAssertParametersWithPartitions(partitions) - - val partitionOffsets = fnRetrievePartitionOffsets(partitions) - - partitionOffsets.foreach { - case (tp, KafkaOffsetRangeLimit.LATEST) => - consumer.seekToEnd(ju.Arrays.asList(tp)) - case (tp, KafkaOffsetRangeLimit.EARLIEST) => - consumer.seekToBeginning(ju.Arrays.asList(tp)) - case (tp, off) => consumer.seek(tp, off) - } - - partitionOffsets.map { - case (tp, _) => tp -> consumer.position(tp) - } - } - } - - fnAssertFetchedOffsets(fetched) - - KafkaSourceOffset(fetched) - } - - /** - * Fetch the earliest offsets for the topic partitions that are indicated - * in the [[ConsumerStrategy]]. 
- */ - def fetchEarliestOffsets(): Map[TopicPartition, Long] = partitionsAssignedToConsumer( - partitions => { - logDebug("Seeking to the beginning") - - consumer.seekToBeginning(partitions) - val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap - logDebug(s"Got earliest offsets for partition : $partitionOffsets") - partitionOffsets - }, fetchingEarliestOffset = true) - - /** - * Fetch the latest offsets for the topic partitions that are indicated - * in the [[ConsumerStrategy]]. - * - * Kafka may return earliest offsets when we are requesting latest offsets if `poll` is called - * right before `seekToEnd` (KAFKA-7703). As a workaround, we will call `position` right after - * `poll` to wait until the potential offset request triggered by `poll(0)` is done. - * - * In addition, to avoid other unknown issues, we also use the given `knownOffsets` to audit the - * latest offsets returned by Kafka. If we find some incorrect offsets (a latest offset is less - * than an offset in `knownOffsets`), we will retry at most `maxOffsetFetchAttempts` times. When - * a topic is recreated, the latest offsets may be less than offsets in `knownOffsets`. We cannot - * distinguish this with KAFKA-7703, so we just return whatever we get from Kafka after retrying. - */ - def fetchLatestOffsets( - knownOffsets: Option[PartitionOffsetMap]): PartitionOffsetMap = - partitionsAssignedToConsumer { partitions => { - logDebug("Seeking to the end.") - - if (knownOffsets.isEmpty) { - consumer.seekToEnd(partitions) - partitions.asScala.map(p => p -> consumer.position(p)).toMap - } else { - var partitionOffsets: PartitionOffsetMap = Map.empty - - /** - * Compare `knownOffsets` and `partitionOffsets`. Returns all partitions that have incorrect - * latest offset (offset in `knownOffsets` is great than the one in `partitionOffsets`). - */ - def findIncorrectOffsets(): Seq[(TopicPartition, Long, Long)] = { - var incorrectOffsets = ArrayBuffer[(TopicPartition, Long, Long)]() - partitionOffsets.foreach { case (tp, offset) => - knownOffsets.foreach(_.get(tp).foreach { knownOffset => - if (knownOffset > offset) { - val incorrectOffset = (tp, knownOffset, offset) - incorrectOffsets += incorrectOffset - } - }) - } - incorrectOffsets.toSeq - } - - // Retry to fetch latest offsets when detecting incorrect offsets. We don't use - // `withRetriesWithoutInterrupt` to retry because: - // - // - `withRetriesWithoutInterrupt` will reset the consumer for each attempt but a fresh - // consumer has a much bigger chance to hit KAFKA-7703. - // - Avoid calling `consumer.poll(0)` which may cause KAFKA-7703. - var incorrectOffsets: Seq[(TopicPartition, Long, Long)] = Nil - var attempt = 0 - do { - consumer.seekToEnd(partitions) - partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap - attempt += 1 - - incorrectOffsets = findIncorrectOffsets() - if (incorrectOffsets.nonEmpty) { - logWarning("Found incorrect offsets in some partitions " + - s"(partition, previous offset, fetched offset): $incorrectOffsets") - if (attempt < maxOffsetFetchAttempts) { - logWarning("Retrying to fetch latest offsets because of incorrect offsets") - Thread.sleep(offsetFetchAttemptIntervalMs) - } - } - } while (incorrectOffsets.nonEmpty && attempt < maxOffsetFetchAttempts) - - logDebug(s"Got latest offsets for partition : $partitionOffsets") - partitionOffsets - } - } - } - - /** - * Fetch the earliest offsets for specific topic partitions. - * The return result may not contain some partitions if they are deleted. 
- */ - def fetchEarliestOffsets( - newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = { - if (newPartitions.isEmpty) { - Map.empty[TopicPartition, Long] - } else { - partitionsAssignedToConsumer(partitions => { - // Get the earliest offset of each partition - consumer.seekToBeginning(partitions) - val partitionOffsets = newPartitions.filter { p => - // When deleting topics happen at the same time, some partitions may not be in - // `partitions`. So we need to ignore them - partitions.contains(p) - }.map(p => p -> consumer.position(p)).toMap - logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") - partitionOffsets - }, fetchingEarliestOffset = true) - } - } - - /** - * Return the offset ranges for a Kafka batch query. If `minPartitions` is set, this method may - * split partitions to respect it. Since offsets can be early and late binding which are evaluated - * on the executors, in order to divvy up the partitions we need to perform some substitutions. We - * don't want to send exact offsets to the executors, because data may age out before we can - * consume the data. This method makes some approximate splitting, and replaces the special offset - * values in the final output. - */ + failsOnNoMatchingOffset: Boolean): KafkaSourceOffset + def fetchEarliestOffsets(): Map[TopicPartition, Long] + def fetchLatestOffsets(knownOffsets: Option[PartitionOffsetMap]): PartitionOffsetMap + def fetchEarliestOffsets(newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] def getOffsetRangesFromUnresolvedOffsets( startingOffsets: KafkaOffsetRangeLimit, - endingOffsets: KafkaOffsetRangeLimit): Seq[KafkaOffsetRange] = { - val fromPartitionOffsets = fetchPartitionOffsets(startingOffsets, isStartingOffsets = true) - val untilPartitionOffsets = fetchPartitionOffsets(endingOffsets, isStartingOffsets = false) - - // Obtain topicPartitions in both from and until partition offset, ignoring - // topic partitions that were added and/or deleted between the two above calls. 
- if (fromPartitionOffsets.keySet != untilPartitionOffsets.keySet) { - implicit val topicOrdering: Ordering[TopicPartition] = Ordering.by(t => t.topic()) - val fromTopics = fromPartitionOffsets.keySet.toList.sorted.mkString(",") - val untilTopics = untilPartitionOffsets.keySet.toList.sorted.mkString(",") - throw new IllegalStateException("different topic partitions " + - s"for starting offsets topics[${fromTopics}] and " + - s"ending offsets topics[${untilTopics}]") - } - - // Calculate offset ranges - val offsetRangesBase = untilPartitionOffsets.keySet.map { tp => - val fromOffset = fromPartitionOffsets.get(tp).getOrElse { - // This should not happen since topicPartitions contains all partitions not in - // fromPartitionOffsets - throw new IllegalStateException(s"$tp doesn't have a from offset") - } - val untilOffset = untilPartitionOffsets(tp) - KafkaOffsetRange(tp, fromOffset, untilOffset, None) - }.toSeq - - if (shouldDivvyUpLargePartitions(offsetRangesBase.size)) { - val fromOffsetsMap = - offsetRangesBase.map(range => (range.topicPartition, range.fromOffset)).toMap - val untilOffsetsMap = - offsetRangesBase.map(range => (range.topicPartition, range.untilOffset)).toMap - - // No need to report data loss here - val resolvedFromOffsets = fetchSpecificOffsets(fromOffsetsMap, _ => ()).partitionToOffsets - val resolvedUntilOffsets = fetchSpecificOffsets(untilOffsetsMap, _ => ()).partitionToOffsets - val ranges = offsetRangesBase.map(_.topicPartition).map { tp => - KafkaOffsetRange(tp, resolvedFromOffsets(tp), resolvedUntilOffsets(tp), preferredLoc = None) - } - val divvied = rangeCalculator.getRanges(ranges).groupBy(_.topicPartition) - divvied.flatMap { case (tp, splitOffsetRanges) => - if (splitOffsetRanges.length == 1) { - Seq(KafkaOffsetRange(tp, fromOffsetsMap(tp), untilOffsetsMap(tp), None)) - } else { - // the list can't be empty - val first = splitOffsetRanges.head.copy(fromOffset = fromOffsetsMap(tp)) - val end = splitOffsetRanges.last.copy(untilOffset = untilOffsetsMap(tp)) - Seq(first) ++ splitOffsetRanges.drop(1).dropRight(1) :+ end - } - }.toArray.toSeq - } else { - offsetRangesBase - } - } - - private def getSortedExecutorList(): Array[String] = { - def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { - if (a.host == b.host) { - a.executorId > b.executorId - } else { - a.host > b.host - } - } - - val bm = SparkEnv.get.blockManager - bm.master.getPeers(bm.blockManagerId).toArray - .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) - .sortWith(compare) - .map(_.toString) - } - - /** - * Return the offset ranges for a Kafka streaming batch. If `minPartitions` is set, this method - * may split partitions to respect it. If any data lost issue is detected, `reportDataLoss` will - * be called. - */ + endingOffsets: KafkaOffsetRangeLimit): Seq[KafkaOffsetRange] def getOffsetRangesFromResolvedOffsets( fromPartitionOffsets: PartitionOffsetMap, untilPartitionOffsets: PartitionOffsetMap, - reportDataLoss: String => Unit): Seq[KafkaOffsetRange] = { - // Find the new partitions, and get their earliest offsets - val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) - val newPartitionInitialOffsets = fetchEarliestOffsets(newPartitions.toSeq) - if (newPartitionInitialOffsets.keySet != newPartitions) { - // We cannot get from offsets for some partitions. It means they got deleted. 
- val deletedPartitions = newPartitions.diff(newPartitionInitialOffsets.keySet) - reportDataLoss( - s"Cannot find earliest offsets of ${deletedPartitions}. Some data may have been missed") - } - logInfo(s"Partitions added: $newPartitionInitialOffsets") - newPartitionInitialOffsets.filter(_._2 != 0).foreach { case (p, o) => - reportDataLoss( - s"Added partition $p starts from $o instead of 0. Some data may have been missed") - } - - val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) - if (deletedPartitions.nonEmpty) { - val message = if (driverKafkaParams.containsKey(ConsumerConfig.GROUP_ID_CONFIG)) { - s"$deletedPartitions are gone. ${KafkaSourceProvider.CUSTOM_GROUP_ID_ERROR_MESSAGE}" - } else { - s"$deletedPartitions are gone. Some data may have been missed." - } - reportDataLoss(message) - } - - // Use the until partitions to calculate offset ranges to ignore partitions that have - // been deleted - val topicPartitions = untilPartitionOffsets.keySet.filter { tp => - // Ignore partitions that we don't know the from offsets. - newPartitionInitialOffsets.contains(tp) || fromPartitionOffsets.contains(tp) - }.toSeq - logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) - - val fromOffsets = fromPartitionOffsets ++ newPartitionInitialOffsets - val untilOffsets = untilPartitionOffsets - val ranges = topicPartitions.map { tp => - val fromOffset = fromOffsets(tp) - val untilOffset = untilOffsets(tp) - if (untilOffset < fromOffset) { - reportDataLoss(s"Partition $tp's offset was changed from " + - s"$fromOffset to $untilOffset, some data may have been missed") - } - KafkaOffsetRange(tp, fromOffset, untilOffset, preferredLoc = None) - } - rangeCalculator.getRanges(ranges, getSortedExecutorList) - } - - private def partitionsAssignedToConsumer( - body: ju.Set[TopicPartition] => Map[TopicPartition, Long], - fetchingEarliestOffset: Boolean = false) - : Map[TopicPartition, Long] = uninterruptibleThreadRunner.runUninterruptibly { - - withRetriesWithoutInterrupt { - // Poll to get the latest assigned partitions - consumer.poll(0) - val partitions = consumer.assignment() - - if (!fetchingEarliestOffset) { - // Call `position` to wait until the potential offset request triggered by `poll(0)` is - // done. This is a workaround for KAFKA-7703, which an async `seekToBeginning` triggered by - // `poll(0)` may reset offsets that should have been set by another request. - partitions.asScala.map(p => p -> consumer.position(p)).foreach(_ => {}) - } - - consumer.pause(partitions) - logDebug(s"Partitions assigned to consumer: $partitions.") - body(partitions) - } - } - - /** - * Helper function that does multiple retries on a body of code that returns offsets. - * Retries are needed to handle transient failures. For e.g. race conditions between getting - * assignment and getting position while topics/partitions are deleted can cause NPEs. - * - * This method also makes sure `body` won't be interrupted to workaround a potential issue in - * `KafkaConsumer.poll`. 
(KAFKA-1894) - */ - private def withRetriesWithoutInterrupt( - body: => Map[TopicPartition, Long]): Map[TopicPartition, Long] = { - // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894) - assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + reportDataLoss: String => Unit): Seq[KafkaOffsetRange] +} - synchronized { - var result: Option[Map[TopicPartition, Long]] = None - var attempt = 1 - var lastException: Throwable = null - while (result.isEmpty && attempt <= maxOffsetFetchAttempts - && !Thread.currentThread().isInterrupted) { - Thread.currentThread match { - case ut: UninterruptibleThread => - // "KafkaConsumer.poll" may hang forever if the thread is interrupted (E.g., the query - // is stopped)(KAFKA-1894). Hence, we just make sure we don't interrupt it. - // - // If the broker addresses are wrong, or Kafka cluster is down, "KafkaConsumer.poll" may - // hang forever as well. This cannot be resolved in KafkaSource until Kafka fixes the - // issue. - ut.runUninterruptibly { - try { - result = Some(body) - } catch { - case NonFatal(e) => - lastException = e - logWarning(s"Error in attempt $attempt getting Kafka offsets: ", e) - attempt += 1 - Thread.sleep(offsetFetchAttemptIntervalMs) - resetConsumer() - } - } - case _ => - throw new IllegalStateException( - "Kafka APIs must be executed on a o.a.spark.util.UninterruptibleThread") - } - } - if (Thread.interrupted()) { - throw new InterruptedException() - } - if (result.isEmpty) { - assert(attempt > maxOffsetFetchAttempts) - assert(lastException != null) - throw lastException - } - result.get +private[kafka010] object KafkaOffsetReader extends Logging { + def build( + consumerStrategy: ConsumerStrategy, + driverKafkaParams: ju.Map[String, Object], + readerOptions: CaseInsensitiveMap[String], + driverGroupIdPrefix: String): KafkaOffsetReader = { + if (SQLConf.get.useDeprecatedKafkaOffsetFetching) { + logDebug("Creating old and deprecated Consumer based offset reader") + new KafkaOffsetReaderConsumer(consumerStrategy, driverKafkaParams, readerOptions, + driverGroupIdPrefix) + } else { + logDebug("Creating new Admin based offset reader") + new KafkaOffsetReaderAdmin(consumerStrategy, driverKafkaParams, readerOptions, + driverGroupIdPrefix) } } - - private def stopConsumer(): Unit = synchronized { - assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) - if (_consumer != null) _consumer.close() - } - - private def resetConsumer(): Unit = synchronized { - stopConsumer() - _consumer = null // will automatically get reinitialized again - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala new file mode 100644 index 0000000000000..d5905795c626b --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.{util => ju} +import java.util.Locale + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer +import scala.util.control.NonFatal + +import org.apache.kafka.clients.admin.{Admin, ListOffsetsOptions, OffsetSpec} +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.common.{IsolationLevel, TopicPartition} +import org.apache.kafka.common.requests.OffsetFetchResponse + +import org.apache.spark.SparkEnv +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.ExecutorCacheTaskLocation +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.util.{UninterruptibleThread, UninterruptibleThreadRunner} + +/** + * This class uses Kafka's own [[Admin]] API to read data offsets from Kafka. + * The [[ConsumerStrategy]] class defines which Kafka topics and partitions should be read + * by this source. These strategies directly correspond to the different consumption options + * in. This class is designed to return a configured [[Admin]] that is used by the + * [[KafkaSource]] to query for the offsets. See the docs on + * [[org.apache.spark.sql.kafka010.ConsumerStrategy]] + * for more details. + * + * Note: This class is not ThreadSafe + */ +private[kafka010] class KafkaOffsetReaderAdmin( + consumerStrategy: ConsumerStrategy, + override val driverKafkaParams: ju.Map[String, Object], + readerOptions: CaseInsensitiveMap[String], + driverGroupIdPrefix: String) extends KafkaOffsetReader with Logging { + + private[kafka010] val maxOffsetFetchAttempts = + readerOptions.getOrElse(KafkaSourceProvider.FETCH_OFFSET_NUM_RETRY, "3").toInt + + private[kafka010] val offsetFetchAttemptIntervalMs = + readerOptions.getOrElse(KafkaSourceProvider.FETCH_OFFSET_RETRY_INTERVAL_MS, "1000").toLong + + /** + * [[UninterruptibleThreadRunner]] ensures that all [[Admin]] communication called in an + * [[UninterruptibleThread]]. In the case of streaming queries, we are already running in an + * [[UninterruptibleThread]], however for batch mode this is not the case. + */ + val uninterruptibleThreadRunner = new UninterruptibleThreadRunner("Kafka Offset Reader") + + /** + * An AdminClient used in the driver to query the latest Kafka offsets. + * This only queries the offsets because AdminClient has no functionality to commit offsets like + * KafkaConsumer. 
+ */ + @volatile protected var _admin: Admin = null + + protected def admin: Admin = synchronized { + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + if (_admin == null) { + _admin = consumerStrategy.createAdmin(driverKafkaParams) + } + _admin + } + + lazy val isolationLevel: IsolationLevel = { + Option(driverKafkaParams.get(ConsumerConfig.ISOLATION_LEVEL_CONFIG)) match { + case Some(s: String) => IsolationLevel.valueOf(s.toUpperCase(Locale.ROOT)) + case None => IsolationLevel.valueOf( + ConsumerConfig.DEFAULT_ISOLATION_LEVEL.toUpperCase(Locale.ROOT)) + case _ => throw new IllegalArgumentException(s"${ConsumerConfig.ISOLATION_LEVEL_CONFIG} " + + "must be either not defined or with type String") + } + } + + private lazy val listOffsetsOptions = new ListOffsetsOptions(isolationLevel) + + private def listOffsets(admin: Admin, listOffsetsParams: ju.Map[TopicPartition, OffsetSpec]) = { + admin.listOffsets(listOffsetsParams, listOffsetsOptions).all().get().asScala + .map(result => result._1 -> result._2.offset()).toMap + } + + /** + * Number of partitions to read from Kafka. If this value is greater than the number of Kafka + * topicPartitions, we will split up the read tasks of the skewed partitions to multiple Spark + * tasks. The number of Spark tasks will be *approximately* `numPartitions`. It can be less or + * more depending on rounding errors or Kafka partitions that didn't receive any new data. + */ + private val minPartitions = + readerOptions.get(KafkaSourceProvider.MIN_PARTITIONS_OPTION_KEY).map(_.toInt) + + private val rangeCalculator = new KafkaOffsetRangeCalculator(minPartitions) + + /** + * Whether we should divide Kafka TopicPartitions with a lot of data into smaller Spark tasks. + */ + private def shouldDivvyUpLargePartitions(numTopicPartitions: Int): Boolean = { + minPartitions.map(_ > numTopicPartitions).getOrElse(false) + } + + override def toString(): String = consumerStrategy.toString + + /** + * Closes the connection to Kafka, and cleans up state. + */ + override def close(): Unit = { + if (_admin != null) uninterruptibleThreadRunner.runUninterruptibly { stopAdmin() } + uninterruptibleThreadRunner.shutdown() + } + + /** + * Fetch the partition offsets for the topic partitions that are indicated + * in the [[ConsumerStrategy]] and [[KafkaOffsetRangeLimit]]. + */ + override def fetchPartitionOffsets( + offsetRangeLimit: KafkaOffsetRangeLimit, + isStartingOffsets: Boolean): Map[TopicPartition, Long] = { + def validateTopicPartitions(partitions: Set[TopicPartition], + partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + assert(partitions == partitionOffsets.keySet, + "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + + "Use -1 for latest, -2 for earliest.\n" + + s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions}") + logDebug(s"Assigned partitions: $partitions. 
Seeking to $partitionOffsets") + partitionOffsets + } + val partitions = uninterruptibleThreadRunner.runUninterruptibly { + consumerStrategy.assignedTopicPartitions(admin) + } + // Obtain TopicPartition offsets with late binding support + offsetRangeLimit match { + case EarliestOffsetRangeLimit => partitions.map { + case tp => tp -> KafkaOffsetRangeLimit.EARLIEST + }.toMap + case LatestOffsetRangeLimit => partitions.map { + case tp => tp -> KafkaOffsetRangeLimit.LATEST + }.toMap + case SpecificOffsetRangeLimit(partitionOffsets) => + validateTopicPartitions(partitions, partitionOffsets) + case SpecificTimestampRangeLimit(partitionTimestamps) => + fetchSpecificTimestampBasedOffsets(partitionTimestamps, + failsOnNoMatchingOffset = isStartingOffsets).partitionToOffsets + } + } + + /** + * Resolves the specific offsets based on Kafka seek positions. + * This method resolves offset value -1 to the latest and -2 to the + * earliest Kafka seek position. + * + * @param partitionOffsets the specific offsets to resolve + * @param reportDataLoss callback to either report or log data loss depending on setting + */ + override def fetchSpecificOffsets( + partitionOffsets: Map[TopicPartition, Long], + reportDataLoss: String => Unit): KafkaSourceOffset = { + val fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit = { partitions => + assert(partitions.asScala == partitionOffsets.keySet, + "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + + "Use -1 for latest, -2 for earliest, if you don't care.\n" + + s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions.asScala}") + logDebug(s"Assigned partitions: $partitions. Seeking to $partitionOffsets") + } + + val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] = { _ => + partitionOffsets + } + + fetchSpecificOffsets0(fnAssertParametersWithPartitions, fnRetrievePartitionOffsets) + } + + override def fetchSpecificTimestampBasedOffsets( + partitionTimestamps: Map[TopicPartition, Long], + failsOnNoMatchingOffset: Boolean): KafkaSourceOffset = { + val fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit = { partitions => + assert(partitions.asScala == partitionTimestamps.keySet, + "If starting/endingOffsetsByTimestamp contains specific offsets, you must specify all " + + s"topics. Specified: ${partitionTimestamps.keySet} Assigned: ${partitions.asScala}") + logDebug(s"Assigned partitions: $partitions. 
Seeking to $partitionTimestamps") + } + + val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] = { _ => { + val listOffsetsParams = partitionTimestamps.map { case (tp, timestamp) => + tp -> OffsetSpec.forTimestamp(timestamp) + }.asJava + admin.listOffsets(listOffsetsParams, listOffsetsOptions).all().get().asScala.map { + case (tp, offsetSpec) => + if (failsOnNoMatchingOffset) { + assert(offsetSpec.offset() != OffsetFetchResponse.INVALID_OFFSET, "No offset " + + s"matched from request of topic-partition $tp and timestamp " + + s"${partitionTimestamps(tp)}.") + } + + if (offsetSpec.offset() == OffsetFetchResponse.INVALID_OFFSET) { + tp -> KafkaOffsetRangeLimit.LATEST + } else { + tp -> offsetSpec.offset() + } + }.toMap + } + } + + fetchSpecificOffsets0(fnAssertParametersWithPartitions, fnRetrievePartitionOffsets) + } + + private def fetchSpecificOffsets0( + fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit, + fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] + ): KafkaSourceOffset = { + val fetched = partitionsAssignedToConsumer { + partitions => { + fnAssertParametersWithPartitions(partitions) + + val partitionOffsets = fnRetrievePartitionOffsets(partitions) + + val listOffsetsParams = partitionOffsets.filter { case (_, off) => + off == KafkaOffsetRangeLimit.LATEST || off == KafkaOffsetRangeLimit.EARLIEST + }.map { case (tp, off) => + off match { + case KafkaOffsetRangeLimit.LATEST => + tp -> OffsetSpec.latest() + case KafkaOffsetRangeLimit.EARLIEST => + tp -> OffsetSpec.earliest() + } + } + val resolvedPartitionOffsets = listOffsets(admin, listOffsetsParams.asJava) + + partitionOffsets.map { case (tp, off) => + off match { + case KafkaOffsetRangeLimit.LATEST => + tp -> resolvedPartitionOffsets(tp) + case KafkaOffsetRangeLimit.EARLIEST => + tp -> resolvedPartitionOffsets(tp) + case _ => + tp -> off + } + } + } + } + + KafkaSourceOffset(fetched) + } + + /** + * Fetch the earliest offsets for the topic partitions that are indicated + * in the [[ConsumerStrategy]]. + */ + override def fetchEarliestOffsets(): Map[TopicPartition, Long] = partitionsAssignedToConsumer( + partitions => { + val listOffsetsParams = partitions.asScala.map(p => p -> OffsetSpec.earliest()).toMap.asJava + val partitionOffsets = listOffsets(admin, listOffsetsParams) + logDebug(s"Got earliest offsets for partitions: $partitionOffsets") + partitionOffsets + }) + + /** + * Fetch the latest offsets for the topic partitions that are indicated + * in the [[ConsumerStrategy]]. + * + * Kafka may return earliest offsets when we are requesting latest offsets if `poll` is called + * right before `seekToEnd` (KAFKA-7703). As a workaround, we will call `position` right after + * `poll` to wait until the potential offset request triggered by `poll(0)` is done. + * + * In addition, to avoid other unknown issues, we also use the given `knownOffsets` to audit the + * latest offsets returned by Kafka. If we find some incorrect offsets (a latest offset is less + * than an offset in `knownOffsets`), we will retry at most `maxOffsetFetchAttempts` times. When + * a topic is recreated, the latest offsets may be less than offsets in `knownOffsets`. We cannot + * distinguish this with KAFKA-7703, so we just return whatever we get from Kafka after retrying. 
+ */ + override def fetchLatestOffsets( + knownOffsets: Option[PartitionOffsetMap]): PartitionOffsetMap = + partitionsAssignedToConsumer { partitions => { + val listOffsetsParams = partitions.asScala.map(_ -> OffsetSpec.latest()).toMap.asJava + if (knownOffsets.isEmpty) { + val partitionOffsets = listOffsets(admin, listOffsetsParams) + logDebug(s"Got latest offsets for partitions: $partitionOffsets") + partitionOffsets + } else { + var partitionOffsets: PartitionOffsetMap = Map.empty + + /** + * Compare `knownOffsets` and `partitionOffsets`. Returns all partitions that have incorrect + * latest offset (offset in `knownOffsets` is great than the one in `partitionOffsets`). + */ + def findIncorrectOffsets(): Seq[(TopicPartition, Long, Long)] = { + var incorrectOffsets = ArrayBuffer[(TopicPartition, Long, Long)]() + partitionOffsets.foreach { case (tp, offset) => + knownOffsets.foreach(_.get(tp).foreach { knownOffset => + if (knownOffset > offset) { + val incorrectOffset = (tp, knownOffset, offset) + incorrectOffsets += incorrectOffset + } + }) + } + // toSeq seems redundant but it's needed for Scala 2.13 + incorrectOffsets.toSeq + } + + // Retry to fetch latest offsets when detecting incorrect offsets. We don't use + // `withRetriesWithoutInterrupt` to retry because: + // + // - `withRetriesWithoutInterrupt` will reset the consumer for each attempt but a fresh + // consumer has a much bigger chance to hit KAFKA-7703. + // - Avoid calling `consumer.poll(0)` which may cause KAFKA-7703. + var incorrectOffsets: Seq[(TopicPartition, Long, Long)] = Nil + var attempt = 0 + do { + partitionOffsets = listOffsets(admin, listOffsetsParams) + attempt += 1 + + incorrectOffsets = findIncorrectOffsets() + if (incorrectOffsets.nonEmpty) { + logWarning("Found incorrect offsets in some partitions " + + s"(partition, previous offset, fetched offset): $incorrectOffsets") + if (attempt < maxOffsetFetchAttempts) { + logWarning("Retrying to fetch latest offsets because of incorrect offsets") + Thread.sleep(offsetFetchAttemptIntervalMs) + } + } + } while (incorrectOffsets.nonEmpty && attempt < maxOffsetFetchAttempts) + + logDebug(s"Got latest offsets for partitions: $partitionOffsets") + partitionOffsets + } + } + } + + /** + * Fetch the earliest offsets for specific topic partitions. + * The return result may not contain some partitions if they are deleted. + */ + override def fetchEarliestOffsets( + newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = { + if (newPartitions.isEmpty) { + Map.empty[TopicPartition, Long] + } else { + partitionsAssignedToConsumer(partitions => { + // Get the earliest offset of each partition + val listOffsetsParams = newPartitions.filter { newPartition => + // When deleting topics happen at the same time, some partitions may not be in + // `partitions`. So we need to ignore them + partitions.contains(newPartition) + }.map(partition => partition -> OffsetSpec.earliest()).toMap.asJava + val partitionOffsets = listOffsets(admin, listOffsetsParams) + logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") + partitionOffsets + }) + } + } + + /** + * Return the offset ranges for a Kafka batch query. If `minPartitions` is set, this method may + * split partitions to respect it. Since offsets can be early and late binding which are evaluated + * on the executors, in order to divvy up the partitions we need to perform some substitutions. We + * don't want to send exact offsets to the executors, because data may age out before we can + * consume the data. 
This method makes some approximate splitting, and replaces the special offset + * values in the final output. + */ + override def getOffsetRangesFromUnresolvedOffsets( + startingOffsets: KafkaOffsetRangeLimit, + endingOffsets: KafkaOffsetRangeLimit): Seq[KafkaOffsetRange] = { + val fromPartitionOffsets = fetchPartitionOffsets(startingOffsets, isStartingOffsets = true) + val untilPartitionOffsets = fetchPartitionOffsets(endingOffsets, isStartingOffsets = false) + + // Obtain topicPartitions in both from and until partition offset, ignoring + // topic partitions that were added and/or deleted between the two above calls. + if (fromPartitionOffsets.keySet != untilPartitionOffsets.keySet) { + implicit val topicOrdering: Ordering[TopicPartition] = Ordering.by(t => t.topic()) + val fromTopics = fromPartitionOffsets.keySet.toList.sorted.mkString(",") + val untilTopics = untilPartitionOffsets.keySet.toList.sorted.mkString(",") + throw new IllegalStateException("different topic partitions " + + s"for starting offsets topics[${fromTopics}] and " + + s"ending offsets topics[${untilTopics}]") + } + + // Calculate offset ranges + val offsetRangesBase = untilPartitionOffsets.keySet.map { tp => + val fromOffset = fromPartitionOffsets.get(tp).getOrElse { + // This should not happen since topicPartitions contains all partitions not in + // fromPartitionOffsets + throw new IllegalStateException(s"$tp doesn't have a from offset") + } + val untilOffset = untilPartitionOffsets(tp) + KafkaOffsetRange(tp, fromOffset, untilOffset, None) + }.toSeq + + if (shouldDivvyUpLargePartitions(offsetRangesBase.size)) { + val fromOffsetsMap = + offsetRangesBase.map(range => (range.topicPartition, range.fromOffset)).toMap + val untilOffsetsMap = + offsetRangesBase.map(range => (range.topicPartition, range.untilOffset)).toMap + + // No need to report data loss here + val resolvedFromOffsets = fetchSpecificOffsets(fromOffsetsMap, _ => ()).partitionToOffsets + val resolvedUntilOffsets = fetchSpecificOffsets(untilOffsetsMap, _ => ()).partitionToOffsets + val ranges = offsetRangesBase.map(_.topicPartition).map { tp => + KafkaOffsetRange(tp, resolvedFromOffsets(tp), resolvedUntilOffsets(tp), preferredLoc = None) + } + val divvied = rangeCalculator.getRanges(ranges).groupBy(_.topicPartition) + divvied.flatMap { case (tp, splitOffsetRanges) => + if (splitOffsetRanges.length == 1) { + Seq(KafkaOffsetRange(tp, fromOffsetsMap(tp), untilOffsetsMap(tp), None)) + } else { + // the list can't be empty + val first = splitOffsetRanges.head.copy(fromOffset = fromOffsetsMap(tp)) + val end = splitOffsetRanges.last.copy(untilOffset = untilOffsetsMap(tp)) + Seq(first) ++ splitOffsetRanges.drop(1).dropRight(1) :+ end + } + }.toArray.toSeq + } else { + offsetRangesBase + } + } + + private def getSortedExecutorList: Array[String] = { + def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { + if (a.host == b.host) { + a.executorId > b.executorId + } else { + a.host > b.host + } + } + + val bm = SparkEnv.get.blockManager + bm.master.getPeers(bm.blockManagerId).toArray + .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) + .sortWith(compare) + .map(_.toString) + } + + /** + * Return the offset ranges for a Kafka streaming batch. If `minPartitions` is set, this method + * may split partitions to respect it. If any data lost issue is detected, `reportDataLoss` will + * be called. 
+ */ + override def getOffsetRangesFromResolvedOffsets( + fromPartitionOffsets: PartitionOffsetMap, + untilPartitionOffsets: PartitionOffsetMap, + reportDataLoss: String => Unit): Seq[KafkaOffsetRange] = { + // Find the new partitions, and get their earliest offsets + val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) + val newPartitionInitialOffsets = fetchEarliestOffsets(newPartitions.toSeq) + if (newPartitionInitialOffsets.keySet != newPartitions) { + // We cannot get from offsets for some partitions. It means they got deleted. + val deletedPartitions = newPartitions.diff(newPartitionInitialOffsets.keySet) + reportDataLoss( + s"Cannot find earliest offsets of ${deletedPartitions}. Some data may have been missed") + } + logInfo(s"Partitions added: $newPartitionInitialOffsets") + newPartitionInitialOffsets.filter(_._2 != 0).foreach { case (p, o) => + reportDataLoss( + s"Added partition $p starts from $o instead of 0. Some data may have been missed") + } + + val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) + if (deletedPartitions.nonEmpty) { + val message = if (driverKafkaParams.containsKey(ConsumerConfig.GROUP_ID_CONFIG)) { + s"$deletedPartitions are gone. ${KafkaSourceProvider.CUSTOM_GROUP_ID_ERROR_MESSAGE}" + } else { + s"$deletedPartitions are gone. Some data may have been missed." + } + reportDataLoss(message) + } + + // Use the until partitions to calculate offset ranges to ignore partitions that have + // been deleted + val topicPartitions = untilPartitionOffsets.keySet.filter { tp => + // Ignore partitions that we don't know the from offsets. + newPartitionInitialOffsets.contains(tp) || fromPartitionOffsets.contains(tp) + }.toSeq + logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) + + val fromOffsets = fromPartitionOffsets ++ newPartitionInitialOffsets + val untilOffsets = untilPartitionOffsets + val ranges = topicPartitions.map { tp => + val fromOffset = fromOffsets(tp) + val untilOffset = untilOffsets(tp) + if (untilOffset < fromOffset) { + reportDataLoss(s"Partition $tp's offset was changed from " + + s"$fromOffset to $untilOffset, some data may have been missed") + } + KafkaOffsetRange(tp, fromOffset, untilOffset, preferredLoc = None) + } + rangeCalculator.getRanges(ranges, getSortedExecutorList) + } + + private def partitionsAssignedToConsumer( + body: ju.Set[TopicPartition] => Map[TopicPartition, Long]) + : Map[TopicPartition, Long] = uninterruptibleThreadRunner.runUninterruptibly { + + withRetriesWithoutInterrupt { + val partitions = consumerStrategy.assignedTopicPartitions(admin).asJava + logDebug(s"Partitions assigned: $partitions.") + body(partitions) + } + } + + /** + * Helper function that does multiple retries on a body of code that returns offsets. + * Retries are needed to handle transient failures. For e.g. race conditions between getting + * assignment and getting position while topics/partitions are deleted can cause NPEs. + * + * This method also makes sure `body` won't be interrupted to workaround similar issues like in + * `KafkaConsumer.poll`. 
(KAFKA-1894) + */ + private def withRetriesWithoutInterrupt( + body: => Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + + synchronized { + var result: Option[Map[TopicPartition, Long]] = None + var attempt = 1 + var lastException: Throwable = null + while (result.isEmpty && attempt <= maxOffsetFetchAttempts + && !Thread.currentThread().isInterrupted) { + Thread.currentThread match { + case ut: UninterruptibleThread => + ut.runUninterruptibly { + try { + result = Some(body) + } catch { + case NonFatal(e) => + lastException = e + logWarning(s"Error in attempt $attempt getting Kafka offsets: ", e) + attempt += 1 + Thread.sleep(offsetFetchAttemptIntervalMs) + resetAdmin() + } + } + case _ => + throw new IllegalStateException( + "Kafka APIs must be executed on a o.a.spark.util.UninterruptibleThread") + } + } + if (Thread.interrupted()) { + throw new InterruptedException() + } + if (result.isEmpty) { + assert(attempt > maxOffsetFetchAttempts) + assert(lastException != null) + throw lastException + } + result.get + } + } + + private def stopAdmin(): Unit = synchronized { + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + if (_admin != null) _admin.close() + } + + private def resetAdmin(): Unit = synchronized { + stopAdmin() + _admin = null // will automatically get reinitialized again + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala new file mode 100644 index 0000000000000..eca41c510f1f2 --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala @@ -0,0 +1,614 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.{util => ju} + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer +import scala.util.control.NonFatal + +import org.apache.kafka.clients.consumer.{Consumer, ConsumerConfig, OffsetAndTimestamp} +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.SparkEnv +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.ExecutorCacheTaskLocation +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.util.{UninterruptibleThread, UninterruptibleThreadRunner} + +/** + * This class uses Kafka's own [[org.apache.kafka.clients.consumer.KafkaConsumer]] API to + * read data offsets from Kafka. + * The [[ConsumerStrategy]] class defines which Kafka topics and partitions should be read + * by this source. 
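+   * For illustration, the strategy flavours exercised by `ConsumerStrategySuite` in this
+   * patch look like (topic names are placeholders):
+   * {{{
+   *   AssignStrategy(Array(new TopicPartition("topic-a", 0)))
+   *   SubscribeStrategy(Seq("topic-a", "topic-b"))
+   *   SubscribePatternStrategy("topic-.*")
+   * }}}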
These strategies directly correspond to the different consumption options + * in. This class is designed to return a configured + * [[org.apache.kafka.clients.consumer.KafkaConsumer]] that is used by the + * [[KafkaSource]] to query for the offsets. See the docs on + * [[org.apache.spark.sql.kafka010.ConsumerStrategy]] + * for more details. + * + * Note: This class is not ThreadSafe + */ +private[kafka010] class KafkaOffsetReaderConsumer( + consumerStrategy: ConsumerStrategy, + override val driverKafkaParams: ju.Map[String, Object], + readerOptions: CaseInsensitiveMap[String], + driverGroupIdPrefix: String) extends KafkaOffsetReader with Logging { + + /** + * [[UninterruptibleThreadRunner]] ensures that all + * [[org.apache.kafka.clients.consumer.KafkaConsumer]] communication called in an + * [[UninterruptibleThread]]. In the case of streaming queries, we are already running in an + * [[UninterruptibleThread]], however for batch mode this is not the case. + */ + val uninterruptibleThreadRunner = new UninterruptibleThreadRunner("Kafka Offset Reader") + + /** + * Place [[groupId]] and [[nextId]] here so that they are initialized before any consumer is + * created -- see SPARK-19564. + */ + private var groupId: String = null + private var nextId = 0 + + /** + * A KafkaConsumer used in the driver to query the latest Kafka offsets. This only queries the + * offsets and never commits them. + */ + @volatile protected var _consumer: Consumer[Array[Byte], Array[Byte]] = null + + protected def consumer: Consumer[Array[Byte], Array[Byte]] = synchronized { + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + if (_consumer == null) { + val newKafkaParams = new ju.HashMap[String, Object](driverKafkaParams) + if (driverKafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG) == null) { + newKafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, nextGroupId()) + } + _consumer = consumerStrategy.createConsumer(newKafkaParams) + } + _consumer + } + + private[kafka010] val maxOffsetFetchAttempts = + readerOptions.getOrElse(KafkaSourceProvider.FETCH_OFFSET_NUM_RETRY, "3").toInt + + /** + * Number of partitions to read from Kafka. If this value is greater than the number of Kafka + * topicPartitions, we will split up the read tasks of the skewed partitions to multiple Spark + * tasks. The number of Spark tasks will be *approximately* `numPartitions`. It can be less or + * more depending on rounding errors or Kafka partitions that didn't receive any new data. + */ + private val minPartitions = + readerOptions.get(KafkaSourceProvider.MIN_PARTITIONS_OPTION_KEY).map(_.toInt) + + private val rangeCalculator = new KafkaOffsetRangeCalculator(minPartitions) + + private[kafka010] val offsetFetchAttemptIntervalMs = + readerOptions.getOrElse(KafkaSourceProvider.FETCH_OFFSET_RETRY_INTERVAL_MS, "1000").toLong + + /** + * Whether we should divide Kafka TopicPartitions with a lot of data into smaller Spark tasks. + */ + private def shouldDivvyUpLargePartitions(numTopicPartitions: Int): Boolean = { + minPartitions.map(_ > numTopicPartitions).getOrElse(false) + } + + private def nextGroupId(): String = { + groupId = driverGroupIdPrefix + "-" + nextId + nextId += 1 + groupId + } + + override def toString(): String = consumerStrategy.toString + + /** + * Closes the connection to Kafka, and cleans up state. 
+ */ + override def close(): Unit = { + if (_consumer != null) uninterruptibleThreadRunner.runUninterruptibly { stopConsumer() } + uninterruptibleThreadRunner.shutdown() + } + + /** + * @return The Set of TopicPartitions for a given topic + */ + private def fetchTopicPartitions(): Set[TopicPartition] = + uninterruptibleThreadRunner.runUninterruptibly { + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + // Poll to get the latest assigned partitions + consumer.poll(0) + val partitions = consumer.assignment() + consumer.pause(partitions) + partitions.asScala.toSet + } + + /** + * Fetch the partition offsets for the topic partitions that are indicated + * in the [[ConsumerStrategy]] and [[KafkaOffsetRangeLimit]]. + */ + override def fetchPartitionOffsets( + offsetRangeLimit: KafkaOffsetRangeLimit, + isStartingOffsets: Boolean): Map[TopicPartition, Long] = { + def validateTopicPartitions(partitions: Set[TopicPartition], + partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + assert(partitions == partitionOffsets.keySet, + "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + + "Use -1 for latest, -2 for earliest.\n" + + s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions}") + logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionOffsets") + partitionOffsets + } + val partitions = fetchTopicPartitions() + // Obtain TopicPartition offsets with late binding support + offsetRangeLimit match { + case EarliestOffsetRangeLimit => partitions.map { + case tp => tp -> KafkaOffsetRangeLimit.EARLIEST + }.toMap + case LatestOffsetRangeLimit => partitions.map { + case tp => tp -> KafkaOffsetRangeLimit.LATEST + }.toMap + case SpecificOffsetRangeLimit(partitionOffsets) => + validateTopicPartitions(partitions, partitionOffsets) + case SpecificTimestampRangeLimit(partitionTimestamps) => + fetchSpecificTimestampBasedOffsets(partitionTimestamps, + failsOnNoMatchingOffset = isStartingOffsets).partitionToOffsets + } + } + + /** + * Resolves the specific offsets based on Kafka seek positions. + * This method resolves offset value -1 to the latest and -2 to the + * earliest Kafka seek position. + * + * @param partitionOffsets the specific offsets to resolve + * @param reportDataLoss callback to either report or log data loss depending on setting + */ + override def fetchSpecificOffsets( + partitionOffsets: Map[TopicPartition, Long], + reportDataLoss: String => Unit): KafkaSourceOffset = { + val fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit = { partitions => + assert(partitions.asScala == partitionOffsets.keySet, + "If startingOffsets contains specific offsets, you must specify all TopicPartitions.\n" + + "Use -1 for latest, -2 for earliest, if you don't care.\n" + + s"Specified: ${partitionOffsets.keySet} Assigned: ${partitions.asScala}") + logDebug(s"Partitions assigned to consumer: $partitions. 
Seeking to $partitionOffsets") + } + + val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] = { _ => + partitionOffsets + } + + val fnAssertFetchedOffsets: Map[TopicPartition, Long] => Unit = { fetched => + partitionOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (fetched(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + } + + fetchSpecificOffsets0(fnAssertParametersWithPartitions, fnRetrievePartitionOffsets, + fnAssertFetchedOffsets) + } + + override def fetchSpecificTimestampBasedOffsets( + partitionTimestamps: Map[TopicPartition, Long], + failsOnNoMatchingOffset: Boolean): KafkaSourceOffset = { + val fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit = { partitions => + assert(partitions.asScala == partitionTimestamps.keySet, + "If starting/endingOffsetsByTimestamp contains specific offsets, you must specify all " + + s"topics. Specified: ${partitionTimestamps.keySet} Assigned: ${partitions.asScala}") + logDebug(s"Partitions assigned to consumer: $partitions. Seeking to $partitionTimestamps") + } + + val fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long] = { _ => { + val converted = partitionTimestamps.map { case (tp, timestamp) => + tp -> java.lang.Long.valueOf(timestamp) + }.asJava + + val offsetForTime: ju.Map[TopicPartition, OffsetAndTimestamp] = + consumer.offsetsForTimes(converted) + + offsetForTime.asScala.map { case (tp, offsetAndTimestamp) => + if (failsOnNoMatchingOffset) { + assert(offsetAndTimestamp != null, "No offset matched from request of " + + s"topic-partition $tp and timestamp ${partitionTimestamps(tp)}.") + } + + if (offsetAndTimestamp == null) { + tp -> KafkaOffsetRangeLimit.LATEST + } else { + tp -> offsetAndTimestamp.offset() + } + }.toMap + } + } + + val fnAssertFetchedOffsets: Map[TopicPartition, Long] => Unit = { _ => } + + fetchSpecificOffsets0(fnAssertParametersWithPartitions, fnRetrievePartitionOffsets, + fnAssertFetchedOffsets) + } + + private def fetchSpecificOffsets0( + fnAssertParametersWithPartitions: ju.Set[TopicPartition] => Unit, + fnRetrievePartitionOffsets: ju.Set[TopicPartition] => Map[TopicPartition, Long], + fnAssertFetchedOffsets: Map[TopicPartition, Long] => Unit): KafkaSourceOffset = { + val fetched = partitionsAssignedToConsumer { + partitions => { + fnAssertParametersWithPartitions(partitions) + + val partitionOffsets = fnRetrievePartitionOffsets(partitions) + + partitionOffsets.foreach { + case (tp, KafkaOffsetRangeLimit.LATEST) => + consumer.seekToEnd(ju.Arrays.asList(tp)) + case (tp, KafkaOffsetRangeLimit.EARLIEST) => + consumer.seekToBeginning(ju.Arrays.asList(tp)) + case (tp, off) => consumer.seek(tp, off) + } + + partitionOffsets.map { + case (tp, _) => tp -> consumer.position(tp) + } + } + } + + fnAssertFetchedOffsets(fetched) + + KafkaSourceOffset(fetched) + } + + /** + * Fetch the earliest offsets for the topic partitions that are indicated + * in the [[ConsumerStrategy]]. 
+ */ + override def fetchEarliestOffsets(): Map[TopicPartition, Long] = partitionsAssignedToConsumer( + partitions => { + logDebug("Seeking to the beginning") + + consumer.seekToBeginning(partitions) + val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap + logDebug(s"Got earliest offsets for partition : $partitionOffsets") + partitionOffsets + }, fetchingEarliestOffset = true) + + /** + * Fetch the latest offsets for the topic partitions that are indicated + * in the [[ConsumerStrategy]]. + * + * Kafka may return earliest offsets when we are requesting latest offsets if `poll` is called + * right before `seekToEnd` (KAFKA-7703). As a workaround, we will call `position` right after + * `poll` to wait until the potential offset request triggered by `poll(0)` is done. + * + * In addition, to avoid other unknown issues, we also use the given `knownOffsets` to audit the + * latest offsets returned by Kafka. If we find some incorrect offsets (a latest offset is less + * than an offset in `knownOffsets`), we will retry at most `maxOffsetFetchAttempts` times. When + * a topic is recreated, the latest offsets may be less than offsets in `knownOffsets`. We cannot + * distinguish this with KAFKA-7703, so we just return whatever we get from Kafka after retrying. + */ + override def fetchLatestOffsets( + knownOffsets: Option[PartitionOffsetMap]): PartitionOffsetMap = + partitionsAssignedToConsumer { partitions => { + logDebug("Seeking to the end.") + + if (knownOffsets.isEmpty) { + consumer.seekToEnd(partitions) + partitions.asScala.map(p => p -> consumer.position(p)).toMap + } else { + var partitionOffsets: PartitionOffsetMap = Map.empty + + /** + * Compare `knownOffsets` and `partitionOffsets`. Returns all partitions that have incorrect + * latest offset (offset in `knownOffsets` is great than the one in `partitionOffsets`). + */ + def findIncorrectOffsets(): Seq[(TopicPartition, Long, Long)] = { + var incorrectOffsets = ArrayBuffer[(TopicPartition, Long, Long)]() + partitionOffsets.foreach { case (tp, offset) => + knownOffsets.foreach(_.get(tp).foreach { knownOffset => + if (knownOffset > offset) { + val incorrectOffset = (tp, knownOffset, offset) + incorrectOffsets += incorrectOffset + } + }) + } + incorrectOffsets.toSeq + } + + // Retry to fetch latest offsets when detecting incorrect offsets. We don't use + // `withRetriesWithoutInterrupt` to retry because: + // + // - `withRetriesWithoutInterrupt` will reset the consumer for each attempt but a fresh + // consumer has a much bigger chance to hit KAFKA-7703. + // - Avoid calling `consumer.poll(0)` which may cause KAFKA-7703. + var incorrectOffsets: Seq[(TopicPartition, Long, Long)] = Nil + var attempt = 0 + do { + consumer.seekToEnd(partitions) + partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap + attempt += 1 + + incorrectOffsets = findIncorrectOffsets() + if (incorrectOffsets.nonEmpty) { + logWarning("Found incorrect offsets in some partitions " + + s"(partition, previous offset, fetched offset): $incorrectOffsets") + if (attempt < maxOffsetFetchAttempts) { + logWarning("Retrying to fetch latest offsets because of incorrect offsets") + Thread.sleep(offsetFetchAttemptIntervalMs) + } + } + } while (incorrectOffsets.nonEmpty && attempt < maxOffsetFetchAttempts) + + logDebug(s"Got latest offsets for partition : $partitionOffsets") + partitionOffsets + } + } + } + + /** + * Fetch the earliest offsets for specific topic partitions. 
+ * The return result may not contain some partitions if they are deleted. + */ + override def fetchEarliestOffsets( + newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = { + if (newPartitions.isEmpty) { + Map.empty[TopicPartition, Long] + } else { + partitionsAssignedToConsumer(partitions => { + // Get the earliest offset of each partition + consumer.seekToBeginning(partitions) + val partitionOffsets = newPartitions.filter { p => + // When deleting topics happen at the same time, some partitions may not be in + // `partitions`. So we need to ignore them + partitions.contains(p) + }.map(p => p -> consumer.position(p)).toMap + logDebug(s"Got earliest offsets for new partitions: $partitionOffsets") + partitionOffsets + }, fetchingEarliestOffset = true) + } + } + + /** + * Return the offset ranges for a Kafka batch query. If `minPartitions` is set, this method may + * split partitions to respect it. Since offsets can be early and late binding which are evaluated + * on the executors, in order to divvy up the partitions we need to perform some substitutions. We + * don't want to send exact offsets to the executors, because data may age out before we can + * consume the data. This method makes some approximate splitting, and replaces the special offset + * values in the final output. + */ + override def getOffsetRangesFromUnresolvedOffsets( + startingOffsets: KafkaOffsetRangeLimit, + endingOffsets: KafkaOffsetRangeLimit): Seq[KafkaOffsetRange] = { + val fromPartitionOffsets = fetchPartitionOffsets(startingOffsets, isStartingOffsets = true) + val untilPartitionOffsets = fetchPartitionOffsets(endingOffsets, isStartingOffsets = false) + + // Obtain topicPartitions in both from and until partition offset, ignoring + // topic partitions that were added and/or deleted between the two above calls. 
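+    // If the two key sets still differ at this point, a topic or partition was created or
+    // dropped between the two fetches above; fail fast rather than build ranges from two
+    // inconsistent snapshots of the cluster metadata.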
+ if (fromPartitionOffsets.keySet != untilPartitionOffsets.keySet) { + implicit val topicOrdering: Ordering[TopicPartition] = Ordering.by(t => t.topic()) + val fromTopics = fromPartitionOffsets.keySet.toList.sorted.mkString(",") + val untilTopics = untilPartitionOffsets.keySet.toList.sorted.mkString(",") + throw new IllegalStateException("different topic partitions " + + s"for starting offsets topics[${fromTopics}] and " + + s"ending offsets topics[${untilTopics}]") + } + + // Calculate offset ranges + val offsetRangesBase = untilPartitionOffsets.keySet.map { tp => + val fromOffset = fromPartitionOffsets.get(tp).getOrElse { + // This should not happen since topicPartitions contains all partitions not in + // fromPartitionOffsets + throw new IllegalStateException(s"$tp doesn't have a from offset") + } + val untilOffset = untilPartitionOffsets(tp) + KafkaOffsetRange(tp, fromOffset, untilOffset, None) + }.toSeq + + if (shouldDivvyUpLargePartitions(offsetRangesBase.size)) { + val fromOffsetsMap = + offsetRangesBase.map(range => (range.topicPartition, range.fromOffset)).toMap + val untilOffsetsMap = + offsetRangesBase.map(range => (range.topicPartition, range.untilOffset)).toMap + + // No need to report data loss here + val resolvedFromOffsets = fetchSpecificOffsets(fromOffsetsMap, _ => ()).partitionToOffsets + val resolvedUntilOffsets = fetchSpecificOffsets(untilOffsetsMap, _ => ()).partitionToOffsets + val ranges = offsetRangesBase.map(_.topicPartition).map { tp => + KafkaOffsetRange(tp, resolvedFromOffsets(tp), resolvedUntilOffsets(tp), preferredLoc = None) + } + val divvied = rangeCalculator.getRanges(ranges).groupBy(_.topicPartition) + divvied.flatMap { case (tp, splitOffsetRanges) => + if (splitOffsetRanges.length == 1) { + Seq(KafkaOffsetRange(tp, fromOffsetsMap(tp), untilOffsetsMap(tp), None)) + } else { + // the list can't be empty + val first = splitOffsetRanges.head.copy(fromOffset = fromOffsetsMap(tp)) + val end = splitOffsetRanges.last.copy(untilOffset = untilOffsetsMap(tp)) + Seq(first) ++ splitOffsetRanges.drop(1).dropRight(1) :+ end + } + }.toArray.toSeq + } else { + offsetRangesBase + } + } + + private def getSortedExecutorList(): Array[String] = { + def compare(a: ExecutorCacheTaskLocation, b: ExecutorCacheTaskLocation): Boolean = { + if (a.host == b.host) { + a.executorId > b.executorId + } else { + a.host > b.host + } + } + + val bm = SparkEnv.get.blockManager + bm.master.getPeers(bm.blockManagerId).toArray + .map(x => ExecutorCacheTaskLocation(x.host, x.executorId)) + .sortWith(compare) + .map(_.toString) + } + + /** + * Return the offset ranges for a Kafka streaming batch. If `minPartitions` is set, this method + * may split partitions to respect it. If any data lost issue is detected, `reportDataLoss` will + * be called. + */ + override def getOffsetRangesFromResolvedOffsets( + fromPartitionOffsets: PartitionOffsetMap, + untilPartitionOffsets: PartitionOffsetMap, + reportDataLoss: String => Unit): Seq[KafkaOffsetRange] = { + // Find the new partitions, and get their earliest offsets + val newPartitions = untilPartitionOffsets.keySet.diff(fromPartitionOffsets.keySet) + val newPartitionInitialOffsets = fetchEarliestOffsets(newPartitions.toSeq) + if (newPartitionInitialOffsets.keySet != newPartitions) { + // We cannot get from offsets for some partitions. It means they got deleted. + val deletedPartitions = newPartitions.diff(newPartitionInitialOffsets.keySet) + reportDataLoss( + s"Cannot find earliest offsets of ${deletedPartitions}. 
Some data may have been missed") + } + logInfo(s"Partitions added: $newPartitionInitialOffsets") + newPartitionInitialOffsets.filter(_._2 != 0).foreach { case (p, o) => + reportDataLoss( + s"Added partition $p starts from $o instead of 0. Some data may have been missed") + } + + val deletedPartitions = fromPartitionOffsets.keySet.diff(untilPartitionOffsets.keySet) + if (deletedPartitions.nonEmpty) { + val message = if (driverKafkaParams.containsKey(ConsumerConfig.GROUP_ID_CONFIG)) { + s"$deletedPartitions are gone. ${KafkaSourceProvider.CUSTOM_GROUP_ID_ERROR_MESSAGE}" + } else { + s"$deletedPartitions are gone. Some data may have been missed." + } + reportDataLoss(message) + } + + // Use the until partitions to calculate offset ranges to ignore partitions that have + // been deleted + val topicPartitions = untilPartitionOffsets.keySet.filter { tp => + // Ignore partitions that we don't know the from offsets. + newPartitionInitialOffsets.contains(tp) || fromPartitionOffsets.contains(tp) + }.toSeq + logDebug("TopicPartitions: " + topicPartitions.mkString(", ")) + + val fromOffsets = fromPartitionOffsets ++ newPartitionInitialOffsets + val untilOffsets = untilPartitionOffsets + val ranges = topicPartitions.map { tp => + val fromOffset = fromOffsets(tp) + val untilOffset = untilOffsets(tp) + if (untilOffset < fromOffset) { + reportDataLoss(s"Partition $tp's offset was changed from " + + s"$fromOffset to $untilOffset, some data may have been missed") + } + KafkaOffsetRange(tp, fromOffset, untilOffset, preferredLoc = None) + } + rangeCalculator.getRanges(ranges, getSortedExecutorList) + } + + private def partitionsAssignedToConsumer( + body: ju.Set[TopicPartition] => Map[TopicPartition, Long], + fetchingEarliestOffset: Boolean = false) + : Map[TopicPartition, Long] = uninterruptibleThreadRunner.runUninterruptibly { + + withRetriesWithoutInterrupt { + // Poll to get the latest assigned partitions + consumer.poll(0) + val partitions = consumer.assignment() + + if (!fetchingEarliestOffset) { + // Call `position` to wait until the potential offset request triggered by `poll(0)` is + // done. This is a workaround for KAFKA-7703, which an async `seekToBeginning` triggered by + // `poll(0)` may reset offsets that should have been set by another request. + partitions.asScala.map(p => p -> consumer.position(p)).foreach(_ => {}) + } + + consumer.pause(partitions) + logDebug(s"Partitions assigned to consumer: $partitions.") + body(partitions) + } + } + + /** + * Helper function that does multiple retries on a body of code that returns offsets. + * Retries are needed to handle transient failures. For e.g. race conditions between getting + * assignment and getting position while topics/partitions are deleted can cause NPEs. + * + * This method also makes sure `body` won't be interrupted to workaround a potential issue in + * `KafkaConsumer.poll`. 
(KAFKA-1894) + */ + private def withRetriesWithoutInterrupt( + body: => Map[TopicPartition, Long]): Map[TopicPartition, Long] = { + // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894) + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + + synchronized { + var result: Option[Map[TopicPartition, Long]] = None + var attempt = 1 + var lastException: Throwable = null + while (result.isEmpty && attempt <= maxOffsetFetchAttempts + && !Thread.currentThread().isInterrupted) { + Thread.currentThread match { + case ut: UninterruptibleThread => + // "KafkaConsumer.poll" may hang forever if the thread is interrupted (E.g., the query + // is stopped)(KAFKA-1894). Hence, we just make sure we don't interrupt it. + // + // If the broker addresses are wrong, or Kafka cluster is down, "KafkaConsumer.poll" may + // hang forever as well. This cannot be resolved in KafkaSource until Kafka fixes the + // issue. + ut.runUninterruptibly { + try { + result = Some(body) + } catch { + case NonFatal(e) => + lastException = e + logWarning(s"Error in attempt $attempt getting Kafka offsets: ", e) + attempt += 1 + Thread.sleep(offsetFetchAttemptIntervalMs) + resetConsumer() + } + } + case _ => + throw new IllegalStateException( + "Kafka APIs must be executed on a o.a.spark.util.UninterruptibleThread") + } + } + if (Thread.interrupted()) { + throw new InterruptedException() + } + if (result.isEmpty) { + assert(attempt > maxOffsetFetchAttempts) + assert(lastException != null) + throw lastException + } + result.get + } + } + + private def stopConsumer(): Unit = synchronized { + assert(Thread.currentThread().isInstanceOf[UninterruptibleThread]) + if (_consumer != null) _consumer.close() + } + + private def resetConsumer(): Unit = synchronized { + stopConsumer() + _consumer = null // will automatically get reinitialized again + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala index 69a66e2209773..ed3407c822b96 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala @@ -56,7 +56,7 @@ private[kafka010] class KafkaRelation( // id. Hence, we should generate a unique id for each query. 
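The hunk just below (and the matching hunks in `KafkaSourceProvider.scala`) switches the call sites from `new KafkaOffsetReader(...)` to a `KafkaOffsetReader.build(...)` factory. The factory itself is not part of this excerpt; the following is only a minimal sketch of what it presumably does, given the two reader implementations and the `spark.sql.streaming.kafka.useDeprecatedOffsetFetching` flag added in this patch (parameter names follow the reader constructors shown here):

```scala
// Sketch only: the real factory lives in KafkaOffsetReader.scala, outside this excerpt.
package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf

private[kafka010] object KafkaOffsetReader {
  def build(
      consumerStrategy: ConsumerStrategy,
      driverKafkaParams: ju.Map[String, Object],
      readerOptions: CaseInsensitiveMap[String],
      driverGroupIdPrefix: String): KafkaOffsetReader = {
    if (SQLConf.get.useDeprecatedKafkaOffsetFetching) {
      // Deprecated KafkaConsumer-based fetching (subject to KAFKA-1894 style hangs).
      new KafkaOffsetReaderConsumer(
        consumerStrategy, driverKafkaParams, readerOptions, driverGroupIdPrefix)
    } else {
      // New AdminClient-based fetching.
      new KafkaOffsetReaderAdmin(
        consumerStrategy, driverKafkaParams, readerOptions, driverGroupIdPrefix)
    }
  }
}
```

Callers opt into the Admin-based path by setting `spark.sql.streaming.kafka.useDeprecatedOffsetFetching` to `false`, as the `*WithAdmin*` test suites in this patch do.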
val uniqueGroupId = KafkaSourceProvider.batchUniqueGroupId(sourceOptions) - val kafkaOffsetReader = new KafkaOffsetReader( + val kafkaOffsetReader = KafkaOffsetReader.build( strategy, KafkaSourceProvider.kafkaParamsForDriver(specifiedKafkaParams), sourceOptions, diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3ace0874674b6..7299b182ae1cc 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -93,7 +93,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister caseInsensitiveParameters, STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) - val kafkaOffsetReader = new KafkaOffsetReader( + val kafkaOffsetReader = KafkaOffsetReader.build( strategy(caseInsensitiveParameters), kafkaParamsForDriver(specifiedKafkaParams), caseInsensitiveParameters, @@ -460,7 +460,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister caseInsensitiveOptions, STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) - val kafkaOffsetReader = new KafkaOffsetReader( + val kafkaOffsetReader = KafkaOffsetReader.build( strategy(caseInsensitiveOptions), kafkaParamsForDriver(specifiedKafkaParams), caseInsensitiveOptions, @@ -489,7 +489,7 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister caseInsensitiveOptions, STARTING_OFFSETS_BY_TIMESTAMP_OPTION_KEY, STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) - val kafkaOffsetReader = new KafkaOffsetReader( + val kafkaOffsetReader = KafkaOffsetReader.build( strategy(caseInsensitiveOptions), kafkaParamsForDriver(specifiedKafkaParams), caseInsensitiveOptions, diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala new file mode 100644 index 0000000000000..939cf0bb36a8c --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.util.UUID + +import scala.collection.JavaConverters._ + +import org.apache.kafka.clients.CommonClientConfigs +import org.apache.kafka.clients.admin.Admin +import org.apache.kafka.common.TopicPartition +import org.mockito.Mockito.mock + +import org.apache.spark.{SparkConf, SparkEnv, SparkFunSuite} + +class ConsumerStrategySuite extends SparkFunSuite { + private var testUtils: KafkaTestUtils = _ + + private def doReturn(value: Any) = org.mockito.Mockito.doReturn(value, Seq.empty: _*) + + protected def newTopic(prefix: String = "topic") = s"$prefix-${UUID.randomUUID().toString}" + + private def setSparkEnv(settings: Iterable[(String, String)]): Unit = { + val conf = new SparkConf().setAll(settings) + val env = mock(classOf[SparkEnv]) + doReturn(conf).when(env).conf + SparkEnv.set(env) + } + + private def adminProps = { + Map[String, Object]( + CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG -> testUtils.brokerAddress + ).asJava + } + + private def admin(strategy: ConsumerStrategy): Admin = { + strategy.createAdmin(adminProps) + } + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils(Map.empty) + testUtils.setup() + setSparkEnv(Map.empty) + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + test("createAdmin must create admin properly") { + val strategy = AssignStrategy(Array.empty) + assert(strategy.createAdmin(adminProps) != null) + } + + test("AssignStrategy.assignedTopicPartitions must give back all assigned") { + val assignedTopic = newTopic() + testUtils.createTopic(assignedTopic, partitions = 3) + val otherExistingTopic = newTopic() + testUtils.createTopic(otherExistingTopic, partitions = 2) + + val partitions = Array( + new TopicPartition(assignedTopic, 0), + new TopicPartition(assignedTopic, 2) + ) + val strategy = AssignStrategy(partitions) + assert(strategy.assignedTopicPartitions(admin(strategy)) === partitions.toSet) + + testUtils.deleteTopic(assignedTopic) + testUtils.deleteTopic(otherExistingTopic) + } + + test("AssignStrategy.assignedTopicPartitions must skip invalid partitions") { + val assignedTopic = newTopic() + testUtils.createTopic(assignedTopic, partitions = 1) + + val partitions = Array(new TopicPartition(assignedTopic, 1)) + val strategy = AssignStrategy(partitions) + assert(strategy.assignedTopicPartitions(admin(strategy)) === Set.empty) + + testUtils.deleteTopic(assignedTopic) + } + + test("SubscribeStrategy.assignedTopicPartitions must give back all assigned") { + val subscribedTopic1 = newTopic() + testUtils.createTopic(subscribedTopic1, partitions = 2) + val subscribedTopic2 = newTopic() + testUtils.createTopic(subscribedTopic2, partitions = 2) + val otherExistingTopic = newTopic() + testUtils.createTopic(otherExistingTopic, partitions = 2) + + val partitions = Set( + new TopicPartition(subscribedTopic1, 0), + new TopicPartition(subscribedTopic1, 1), + new TopicPartition(subscribedTopic2, 0), + new TopicPartition(subscribedTopic2, 1) + ) + val strategy = SubscribeStrategy(Seq(subscribedTopic1, subscribedTopic2)) + assert(strategy.assignedTopicPartitions(admin(strategy)) === partitions) + + testUtils.deleteTopic(subscribedTopic1) + testUtils.deleteTopic(subscribedTopic2) + testUtils.deleteTopic(otherExistingTopic) + } + + test("SubscribePatternStrategy.assignedTopicPartitions must give back all assigned") { + val subscribePattern = "subscribePattern" + val subscribedTopic1 = 
newTopic(subscribePattern) + testUtils.createTopic(subscribedTopic1, partitions = 2) + val subscribedTopic2 = newTopic(subscribePattern) + testUtils.createTopic(subscribedTopic2, partitions = 2) + val otherExistingTopic = newTopic("other") + testUtils.createTopic(otherExistingTopic, partitions = 2) + + val partitions = Set( + new TopicPartition(subscribedTopic1, 0), + new TopicPartition(subscribedTopic1, 1), + new TopicPartition(subscribedTopic2, 0), + new TopicPartition(subscribedTopic2, 1) + ) + val strategy = SubscribePatternStrategy(s"$subscribePattern.*") + assert(strategy.assignedTopicPartitions(admin(strategy)) === partitions) + + testUtils.deleteTopic(subscribedTopic1) + testUtils.deleteTopic(subscribedTopic2) + testUtils.deleteTopic(otherExistingTopic) + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index 08f673455d729..f2be8475151e3 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -608,7 +608,9 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { // in executors. val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { override def open(partitionId: Long, version: Long): Boolean = { + // Re-create topic since Kafka auto topic creation is not supported by Spark KafkaSourceSuite.globalTestUtils.deleteTopic(topic) + KafkaSourceSuite.globalTestUtils.createTopic(topic) true } @@ -690,19 +692,25 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { } test("allow group.id prefix") { - testGroupId("groupIdPrefix", (expected, actual) => { - assert(actual.exists(_.startsWith(expected)) && !actual.exists(_ === expected), - "Valid consumer groups don't contain the expected group id - " + - s"Valid consumer groups: $actual / expected group id: $expected") - }) + // Group ID prefix is only supported by consumer based offset reader + if (spark.conf.get(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING)) { + testGroupId("groupIdPrefix", (expected, actual) => { + assert(actual.exists(_.startsWith(expected)) && !actual.exists(_ === expected), + "Valid consumer groups don't contain the expected group id - " + + s"Valid consumer groups: $actual / expected group id: $expected") + }) + } } test("allow group.id override") { - testGroupId("kafka.group.id", (expected, actual) => { - assert(actual.exists(_ === expected), "Valid consumer groups don't " + - s"contain the expected group id - Valid consumer groups: $actual / " + - s"expected group id: $expected") - }) + // Group ID override is only supported by consumer based offset reader + if (spark.conf.get(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING)) { + testGroupId("kafka.group.id", (expected, actual) => { + assert(actual.exists(_ === expected), "Valid consumer groups don't " + + s"contain the expected group id - Valid consumer groups: $actual / " + + s"expected group id: $expected") + }) + } } private def testGroupId(groupIdKey: String, @@ -1121,6 +1129,20 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { } +class KafkaMicroBatchV1SourceWithAdminSuite extends KafkaMicroBatchV1SourceSuite { + override def beforeAll(): Unit = { + super.beforeAll() + 
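+    // Setting the flag to "false" makes these suites exercise the new AdminClient-based
+    // offset reader added in this patch instead of the deprecated KafkaConsumer-based one.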
spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false") + } +} + +class KafkaMicroBatchV2SourceWithAdminSuite extends KafkaMicroBatchV2SourceSuite { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false") + } +} + class KafkaMicroBatchV1SourceSuite extends KafkaMicroBatchSourceSuiteBase { override def beforeAll(): Unit = { super.beforeAll() diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala index ad22a56d9157f..d1e49b0e14314 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala @@ -17,13 +17,17 @@ package org.apache.spark.sql.kafka010 +import java.util.Locale import java.util.UUID import java.util.concurrent.atomic.AtomicInteger -import org.apache.kafka.common.TopicPartition +import org.apache.kafka.clients.CommonClientConfigs +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.common.{IsolationLevel, TopicPartition} import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.kafka010.KafkaOffsetRangeLimit.{EARLIEST, LATEST} import org.apache.spark.sql.test.SharedSparkSession @@ -53,9 +57,9 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk } private def createKafkaReader(topic: String, minPartitions: Option[Int]): KafkaOffsetReader = { - new KafkaOffsetReader( + KafkaOffsetReader.build( SubscribeStrategy(Seq(topic)), - org.apache.spark.sql.kafka010.KafkaSourceProvider.kafkaParamsForDriver( + KafkaSourceProvider.kafkaParamsForDriver( Map( "bootstrap.servers" -> testUtils.brokerAddress @@ -66,7 +70,39 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk ) } - test("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - using specific offsets") { + test("isolationLevel must give back default isolation level when not set") { + testIsolationLevel(None, + IsolationLevel.valueOf(ConsumerConfig.DEFAULT_ISOLATION_LEVEL.toUpperCase(Locale.ROOT))) + } + + test("isolationLevel must give back READ_UNCOMMITTED when set") { + testIsolationLevel(Some("read_uncommitted"), IsolationLevel.READ_UNCOMMITTED) + } + + test("isolationLevel must give back READ_COMMITTED when set") { + testIsolationLevel(Some("read_committed"), IsolationLevel.READ_COMMITTED) + } + + test("isolationLevel must throw exception when invalid isolation level set") { + intercept[IllegalArgumentException] { + testIsolationLevel(Some("intentionally_invalid"), IsolationLevel.READ_COMMITTED) + } + } + + private def testIsolationLevel(kafkaParam: Option[String], isolationLevel: IsolationLevel) = { + var kafkaParams = Map(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG -> testUtils.brokerAddress) + kafkaParam.foreach(p => kafkaParams ++= Map(ConsumerConfig.ISOLATION_LEVEL_CONFIG -> p)) + val reader = new KafkaOffsetReaderAdmin( + SubscribeStrategy(Seq()), + KafkaSourceProvider.kafkaParamsForDriver(kafkaParams), + CaseInsensitiveMap(Map.empty), + "" + ) + assert(reader.isolationLevel === isolationLevel) + } + + testWithAllOffsetFetchingSQLConf("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - " + + "using specific offsets") { val topic 
= newTopic() testUtils.createTopic(topic, partitions = 1) testUtils.sendMessages(topic, (0 until 10).map(_.toString).toArray, Some(0)) @@ -74,14 +110,16 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk val reader = createKafkaReader(topic, minPartitions = Some(3)) val startingOffsets = SpecificOffsetRangeLimit(Map(tp -> 1)) val endingOffsets = SpecificOffsetRangeLimit(Map(tp -> 4)) - val offsetRanges = reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, endingOffsets) - assert(offsetRanges === Seq( + val offsetRanges = reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, + endingOffsets) + assert(offsetRanges.sortBy(_.topicPartition.toString) === Seq( KafkaOffsetRange(tp, 1, 2, None), KafkaOffsetRange(tp, 2, 3, None), - KafkaOffsetRange(tp, 3, 4, None))) + KafkaOffsetRange(tp, 3, 4, None)).sortBy(_.topicPartition.toString)) } - test("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - using special offsets") { + testWithAllOffsetFetchingSQLConf("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - " + + "using special offsets") { val topic = newTopic() testUtils.createTopic(topic, partitions = 1) testUtils.sendMessages(topic, (0 until 4).map(_.toString).toArray, Some(0)) @@ -89,14 +127,16 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk val reader = createKafkaReader(topic, minPartitions = Some(3)) val startingOffsets = EarliestOffsetRangeLimit val endingOffsets = LatestOffsetRangeLimit - val offsetRanges = reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, endingOffsets) - assert(offsetRanges === Seq( + val offsetRanges = reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, + endingOffsets) + assert(offsetRanges.sortBy(_.topicPartition.toString) === Seq( KafkaOffsetRange(tp, EARLIEST, 1, None), KafkaOffsetRange(tp, 1, 2, None), - KafkaOffsetRange(tp, 2, LATEST, None))) + KafkaOffsetRange(tp, 2, LATEST, None)).sortBy(_.topicPartition.toString)) } - test("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - multiple topic partitions") { + testWithAllOffsetFetchingSQLConf("SPARK-30656: getOffsetRangesFromUnresolvedOffsets - " + + "multiple topic partitions") { val topic = newTopic() testUtils.createTopic(topic, partitions = 2) testUtils.sendMessages(topic, (0 until 100).map(_.toString).toArray, Some(0)) @@ -107,15 +147,16 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk val startingOffsets = SpecificOffsetRangeLimit(Map(tp1 -> EARLIEST, tp2 -> EARLIEST)) val endingOffsets = SpecificOffsetRangeLimit(Map(tp1 -> LATEST, tp2 -> 3)) - val offsetRanges = reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, endingOffsets) - assert(offsetRanges === Seq( + val offsetRanges = reader.getOffsetRangesFromUnresolvedOffsets(startingOffsets, + endingOffsets) + assert(offsetRanges.sortBy(_.topicPartition.toString) === Seq( KafkaOffsetRange(tp2, EARLIEST, 3, None), KafkaOffsetRange(tp1, EARLIEST, 33, None), KafkaOffsetRange(tp1, 33, 66, None), - KafkaOffsetRange(tp1, 66, LATEST, None))) + KafkaOffsetRange(tp1, 66, LATEST, None)).sortBy(_.topicPartition.toString)) } - test("SPARK-30656: getOffsetRangesFromResolvedOffsets") { + testWithAllOffsetFetchingSQLConf("SPARK-30656: getOffsetRangesFromResolvedOffsets") { val topic = newTopic() testUtils.createTopic(topic, partitions = 2) testUtils.sendMessages(topic, (0 until 100).map(_.toString).toArray, Some(0)) @@ -130,10 +171,28 @@ class KafkaOffsetReaderSuite extends QueryTest with SharedSparkSession with Kafk 
fromPartitionOffsets, untilPartitionOffsets, _ => {}) - assert(offsetRanges === Seq( + assert(offsetRanges.sortBy(_.topicPartition.toString) === Seq( KafkaOffsetRange(tp1, 0, 33, None), KafkaOffsetRange(tp1, 33, 66, None), KafkaOffsetRange(tp1, 66, 100, None), - KafkaOffsetRange(tp2, 0, 3, None))) + KafkaOffsetRange(tp2, 0, 3, None)).sortBy(_.topicPartition.toString)) + } + + private def testWithAllOffsetFetchingSQLConf(name: String)(func: => Any): Unit = { + Seq("true", "false").foreach { useDeprecatedOffsetFetching => + val testName = s"$name with useDeprecatedOffsetFetching $useDeprecatedOffsetFetching" + executeFuncWithSQLConf(testName, useDeprecatedOffsetFetching, func) + } + } + + private def executeFuncWithSQLConf( + name: String, + useDeprecatedOffsetFetching: String, + func: => Any): Unit = { + test(name) { + withSQLConf(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key -> useDeprecatedOffsetFetching) { + func + } + } } } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala index 6f5dc0bb081ba..16fa24a68abe2 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala @@ -22,8 +22,6 @@ import java.util.Locale import java.util.concurrent.atomic.AtomicInteger import scala.annotation.tailrec -import scala.collection.JavaConverters._ -import scala.util.Random import org.apache.kafka.clients.producer.ProducerRecord import org.apache.kafka.common.TopicPartition @@ -465,41 +463,6 @@ abstract class KafkaRelationSuiteBase extends QueryTest with SharedSparkSession testBadOptions("subscribePattern" -> "")("pattern to subscribe is empty") } - test("allow group.id prefix") { - testGroupId("groupIdPrefix", (expected, actual) => { - assert(actual.exists(_.startsWith(expected)) && !actual.exists(_ === expected), - "Valid consumer groups don't contain the expected group id - " + - s"Valid consumer groups: $actual / expected group id: $expected") - }) - } - - test("allow group.id override") { - testGroupId("kafka.group.id", (expected, actual) => { - assert(actual.exists(_ === expected), "Valid consumer groups don't " + - s"contain the expected group id - Valid consumer groups: $actual / " + - s"expected group id: $expected") - }) - } - - private def testGroupId(groupIdKey: String, - validateGroupId: (String, Iterable[String]) => Unit): Unit = { - // Tests code path KafkaSourceProvider.createRelation(.) 
-    val topic = newTopic()
-    testUtils.createTopic(topic, partitions = 3)
-    testUtils.sendMessages(topic, (1 to 10).map(_.toString).toArray, Some(0))
-    testUtils.sendMessages(topic, (11 to 20).map(_.toString).toArray, Some(1))
-    testUtils.sendMessages(topic, (21 to 30).map(_.toString).toArray, Some(2))
-
-    val customGroupId = "id-" + Random.nextInt()
-    val df = createDF(topic, withOptions = Map(groupIdKey -> customGroupId))
-    checkAnswer(df, (1 to 30).map(_.toString).toDF())
-
-    val consumerGroups = testUtils.listConsumerGroups()
-    val validGroups = consumerGroups.valid().get()
-    val validGroupsId = validGroups.asScala.map(_.groupId())
-    validateGroupId(customGroupId, validGroupsId)
-  }
-
   test("read Kafka transactional messages: read_committed") {
     val topic = newTopic()
     testUtils.createTopic(topic)
@@ -622,6 +585,16 @@ abstract class KafkaRelationSuiteBase extends QueryTest with SharedSparkSession
   }
 }
 
+class KafkaRelationSuiteWithAdminV1 extends KafkaRelationSuiteV1 {
+  override protected def sparkConf: SparkConf =
+    super.sparkConf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false")
+}
+
+class KafkaRelationSuiteWithAdminV2 extends KafkaRelationSuiteV2 {
+  override protected def sparkConf: SparkConf =
+    super.sparkConf.set(SQLConf.USE_DEPRECATED_KAFKA_OFFSET_FETCHING.key, "false")
+}
+
 class KafkaRelationSuiteV1 extends KafkaRelationSuiteBase {
   override protected def sparkConf: SparkConf =
     super
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b2c28ffa984a9..979ddebc637f0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1415,6 +1415,17 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
+  val USE_DEPRECATED_KAFKA_OFFSET_FETCHING =
+    buildConf("spark.sql.streaming.kafka.useDeprecatedOffsetFetching")
+      .internal()
+      .doc("When true, the deprecated Consumer-based offset fetching is used, which can " +
+        "cause infinite waits in Spark queries; in such cases restarting the query is the " +
+        "only workaround. For further details please see the Offset Fetching chapter of " +
+        "the Structured Streaming Kafka Integration Guide.")
+      .version("3.1.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val STATEFUL_OPERATOR_CHECK_CORRECTNESS_ENABLED =
     buildConf("spark.sql.streaming.statefulOperator.checkCorrectness.enabled")
       .internal()
@@ -3065,6 +3076,8 @@ class SQLConf extends Serializable with Logging {
   def isUnsupportedOperationCheckEnabled: Boolean = getConf(UNSUPPORTED_OPERATION_CHECK_ENABLED)
 
+  def useDeprecatedKafkaOffsetFetching: Boolean = getConf(USE_DEPRECATED_KAFKA_OFFSET_FETCHING)
+
   def statefulOperatorCorrectnessCheckEnabled: Boolean =
     getConf(STATEFUL_OPERATOR_CHECK_CORRECTNESS_ENABLED)
 

From d38883c1d811f57e5b9f07b29730b7ac6a6731ca Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Tue, 1 Dec 2020 11:38:42 +0000
Subject: [PATCH 064/150] [SPARK-32405][SQL][FOLLOWUP] Throw Exception if provider is specified in JDBCTableCatalog create table

### What changes were proposed in this pull request?
Throw an exception if a provider is specified in a JDBC Table Catalog CREATE TABLE.

### Why are the changes needed?
The JDBC Table Catalog doesn't support a provider, so we should throw an exception. Previously the CREATE TABLE syntax forced people to specify a provider, which is why the tests had to add `USING _`. Now that the syntax issue is fixed, we throw an exception when a provider is specified.
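As a rough illustration of the new behaviour (the `h2` catalog name and column types follow `JDBCTableCatalogSuite`; the `parquet` provider is just an arbitrary example):

```scala
// Works: no provider is given, the JDBC catalog decides how the table is stored.
spark.sql("CREATE TABLE h2.test.new_table(i INT, j STRING)")

// Now throws AnalysisException:
//   CREATE TABLE ... USING ... is not supported in JDBC catalog.
spark.sql("CREATE TABLE h2.test.other_table(i INT, j STRING) USING parquet")
```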
### Does this PR introduce _any_ user-facing change? Yes. We throw Exception if a provider is specified in CREATE TABLE for JDBC Table catalog. ### How was this patch tested? Existing tests (remove `USING _`) Closes #30544 from huaxingao/followup. Authored-by: Huaxin Gao Signed-off-by: Wenchen Fan --- .../v2/jdbc/JDBCTableCatalog.scala | 3 ++- .../v2/jdbc/JDBCTableCatalogSuite.scala | 27 +++++++++---------- .../apache/spark/sql/jdbc/JDBCV2Suite.scala | 21 +++++---------- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala index e96b37e05c762..63f802363f7c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala @@ -126,8 +126,9 @@ class JDBCTableCatalog extends TableCatalog with Logging { properties.asScala.map { case (k, v) => k match { case "comment" => tableComment = v - // ToDo: have a follow up to fail provider once unify create table syntax PR is merged case "provider" => + throw new AnalysisException("CREATE TABLE ... USING ... is not supported in" + + " JDBC catalog.") case "owner" => // owner is ignored. It is default to current user name. case "location" => throw new AnalysisException("CREATE TABLE ... LOCATION ... is not supported in" + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala index 97dd92acc7805..9e9df7db1e1c6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala @@ -153,21 +153,20 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("create a table") { withTable("h2.test.new_table") { - // TODO (SPARK-32427): Omit USING in CREATE TABLE - sql("CREATE TABLE h2.test.new_table(i INT, j STRING) USING _") + sql("CREATE TABLE h2.test.new_table(i INT, j STRING)") checkAnswer( sql("SHOW TABLES IN h2.test"), Seq(Row("test", "people"), Row("test", "new_table"))) } withTable("h2.test.new_table") { - sql("CREATE TABLE h2.test.new_table(i INT, j STRING) USING _") + sql("CREATE TABLE h2.test.new_table(i INT, j STRING)") val msg = intercept[AnalysisException] { - sql("CREATE TABLE h2.test.new_table(i INT, j STRING) USING _") + sql("CREATE TABLE h2.test.new_table(i INT, j STRING)") }.getMessage assert(msg.contains("Table test.new_table already exists")) } val exp = intercept[NoSuchNamespaceException] { - sql("CREATE TABLE h2.bad_test.new_table(i INT, j STRING) USING _") + sql("CREATE TABLE h2.bad_test.new_table(i INT, j STRING)") } assert(exp.getMessage.contains("Failed table creation: bad_test.new_table")) assert(exp.cause.get.getMessage.contains("Schema \"bad_test\" not found")) @@ -176,7 +175,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE ... 
add column") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (ID INTEGER) USING _") + sql(s"CREATE TABLE $tableName (ID INTEGER)") sql(s"ALTER TABLE $tableName ADD COLUMNS (C1 INTEGER, C2 STRING)") var t = spark.table(tableName) var expectedSchema = new StructType() @@ -206,7 +205,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE ... rename column") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (id INTEGER, C0 INTEGER) USING _") + sql(s"CREATE TABLE $tableName (id INTEGER, C0 INTEGER)") sql(s"ALTER TABLE $tableName RENAME COLUMN id TO C") val t = spark.table(tableName) val expectedSchema = new StructType() @@ -231,7 +230,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE ... drop column") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (C1 INTEGER, C2 INTEGER, c3 INTEGER) USING _") + sql(s"CREATE TABLE $tableName (C1 INTEGER, C2 INTEGER, c3 INTEGER)") sql(s"ALTER TABLE $tableName DROP COLUMN C1") sql(s"ALTER TABLE $tableName DROP COLUMN c3") val t = spark.table(tableName) @@ -255,7 +254,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE ... update column type") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (ID INTEGER, deptno INTEGER) USING _") + sql(s"CREATE TABLE $tableName (ID INTEGER, deptno INTEGER)") sql(s"ALTER TABLE $tableName ALTER COLUMN id TYPE DOUBLE") sql(s"ALTER TABLE $tableName ALTER COLUMN deptno TYPE DOUBLE") val t = spark.table(tableName) @@ -284,7 +283,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE ... update column nullability") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (ID INTEGER NOT NULL, deptno INTEGER NOT NULL) USING _") + sql(s"CREATE TABLE $tableName (ID INTEGER NOT NULL, deptno INTEGER NOT NULL)") sql(s"ALTER TABLE $tableName ALTER COLUMN ID DROP NOT NULL") sql(s"ALTER TABLE $tableName ALTER COLUMN deptno DROP NOT NULL") val t = spark.table(tableName) @@ -309,7 +308,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE ... 
update column comment not supported") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (ID INTEGER) USING _") + sql(s"CREATE TABLE $tableName (ID INTEGER)") val exp = intercept[AnalysisException] { sql(s"ALTER TABLE $tableName ALTER COLUMN ID COMMENT 'test'") } @@ -333,7 +332,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("ALTER TABLE case sensitivity") { val tableName = "h2.test.alt_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (c1 INTEGER NOT NULL, c2 INTEGER) USING _") + sql(s"CREATE TABLE $tableName (c1 INTEGER NOT NULL, c2 INTEGER)") var t = spark.table(tableName) var expectedSchema = new StructType().add("c1", IntegerType).add("c2", IntegerType) assert(t.schema === expectedSchema) @@ -400,7 +399,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { withTable("h2.test.new_table") { val logAppender = new LogAppender("table comment") withLogAppender(logAppender) { - sql("CREATE TABLE h2.test.new_table(i INT, j STRING) USING _ COMMENT 'this is a comment'") + sql("CREATE TABLE h2.test.new_table(i INT, j STRING) COMMENT 'this is a comment'") } val createCommentWarning = logAppender.loggingEvents .filter(_.getLevel == Level.WARN) @@ -413,7 +412,7 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { test("CREATE TABLE with table property") { withTable("h2.test.new_table") { val m = intercept[AnalysisException] { - sql("CREATE TABLE h2.test.new_table(i INT, j STRING) USING _" + + sql("CREATE TABLE h2.test.new_table(i INT, j STRING)" + " TBLPROPERTIES('ENGINE'='tableEngineName')") }.cause.get.getMessage assert(m.contains("\"TABLEENGINENAME\" not found")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala index 3bcacd03b4a0d..e8157e552d754 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala @@ -111,7 +111,7 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession { test("read/write with partition info") { withTable("h2.test.abc") { - sql("CREATE TABLE h2.test.abc USING _ AS SELECT * FROM h2.test.people") + sql("CREATE TABLE h2.test.abc AS SELECT * FROM h2.test.people") val df1 = Seq(("evan", 3), ("cathy", 4), ("alex", 5)).toDF("NAME", "ID") val e = intercept[IllegalArgumentException] { df1.write @@ -148,11 +148,9 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession { Seq(Row("test", "people"), Row("test", "empty_table"))) } - // TODO (SPARK-32603): Operation not allowed: CREATE TABLE ... STORED AS ... 
does not support - // multi-part identifiers test("SQL API: create table as select") { withTable("h2.test.abc") { - sql("CREATE TABLE h2.test.abc USING _ AS SELECT * FROM h2.test.people") + sql("CREATE TABLE h2.test.abc AS SELECT * FROM h2.test.people") checkAnswer(sql("SELECT name, id FROM h2.test.abc"), Seq(Row("fred", 1), Row("mary", 2))) } } @@ -164,15 +162,14 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession { } } - // TODO (SPARK-32603): ParseException: mismatched input 'AS' expecting {'(', 'USING'} test("SQL API: replace table as select") { withTable("h2.test.abc") { intercept[CannotReplaceMissingTableException] { - sql("REPLACE TABLE h2.test.abc USING _ AS SELECT 1 as col") + sql("REPLACE TABLE h2.test.abc AS SELECT 1 as col") } - sql("CREATE OR REPLACE TABLE h2.test.abc USING _ AS SELECT 1 as col") + sql("CREATE OR REPLACE TABLE h2.test.abc AS SELECT 1 as col") checkAnswer(sql("SELECT col FROM h2.test.abc"), Row(1)) - sql("REPLACE TABLE h2.test.abc USING _ AS SELECT * FROM h2.test.people") + sql("REPLACE TABLE h2.test.abc AS SELECT * FROM h2.test.people") checkAnswer(sql("SELECT name, id FROM h2.test.abc"), Seq(Row("fred", 1), Row("mary", 2))) } } @@ -189,11 +186,9 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession { } } - // TODO (SPARK-32603): Operation not allowed: CREATE TABLE ... STORED AS ... does not support - // multi-part identifiers test("SQL API: insert and overwrite") { withTable("h2.test.abc") { - sql("CREATE TABLE h2.test.abc USING _ AS SELECT * FROM h2.test.people") + sql("CREATE TABLE h2.test.abc AS SELECT * FROM h2.test.people") sql("INSERT INTO h2.test.abc SELECT 'lucy', 3") checkAnswer( @@ -205,11 +200,9 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession { } } - // TODO (SPARK-32603): Operation not allowed: CREATE TABLE ... STORED AS ... does not support - // multi-part identifiers test("DataFrameWriterV2: insert and overwrite") { withTable("h2.test.abc") { - sql("CREATE TABLE h2.test.abc USING _ AS SELECT * FROM h2.test.people") + sql("CREATE TABLE h2.test.abc AS SELECT * FROM h2.test.people") // `DataFrameWriterV2` is by-name. sql("SELECT 3 AS ID, 'lucy' AS NAME").writeTo("h2.test.abc").append() From 9273d4250ddd5e011487a5a942c1b4d0f0412f78 Mon Sep 17 00:00:00 2001 From: gengjiaan Date: Tue, 1 Dec 2020 11:48:30 +0000 Subject: [PATCH 065/150] [SPARK-33045][SQL][FOLLOWUP] Support built-in function like_any and fix StackOverflowError issue ### What changes were proposed in this pull request? Spark already support `LIKE ANY` syntax, but it will throw `StackOverflowError` if there are many elements(more than 14378 elements). We should implement built-in function for LIKE ANY to fix this issue. Why the stack overflow can happen in the current approach ? The current approach uses reduceLeft to connect each `Like(e, p)`, this will lead the the call depth of the thread is too large, causing `StackOverflowError` problems. Why the fix in this PR can avoid the error? This PR support built-in function for `LIKE ANY` and avoid this issue. ### Why are the changes needed? 1.Fix the `StackOverflowError` issue. 2.Support built-in function `like_any`. ### Does this PR introduce _any_ user-facing change? 'No'. ### How was this patch tested? Jenkins test. Closes #30465 from beliefer/SPARK-33045-like_any-bak. 
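A rough before/after sketch of the parser output may help; the expression, patterns, and values below are illustrative, not code from this patch:

```scala
import org.apache.spark.sql.catalyst.expressions.{Like, LikeAny, Literal, Or}
import org.apache.spark.unsafe.types.UTF8String

val e = Literal("foobar")                       // stand-in for the tested expression
val patterns = Seq("%foo%", "%bar%", "%baz%")   // imagine thousands of these

// Previous approach: one nested Or per pattern, so the tree depth (and the evaluation
// call stack) grows with the number of patterns, which is what overflows the stack.
val expanded = patterns.map(p => new Like(e, Literal(p))).reduceLeft(Or)

// With this patch, when every pattern is a foldable string the parser emits a single
// flat expression that loops over pre-compiled patterns, so the depth stays constant.
val flat = LikeAny(e, patterns.map(UTF8String.fromString))
```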
Lead-authored-by: gengjiaan Co-authored-by: beliefer Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/dsl/package.scala | 4 + .../expressions/regexpExpressions.scala | 98 ++++++++++++++++--- .../sql/catalyst/parser/AstBuilder.scala | 31 +++--- .../apache/spark/sql/internal/SQLConf.scala | 14 --- .../expressions/RegexpExpressionsSuite.scala | 26 +++++ .../parser/ExpressionParserSuite.scala | 12 +-- .../resources/sql-tests/inputs/like-all.sql | 2 - .../resources/sql-tests/inputs/like-any.sql | 2 + 8 files changed, 138 insertions(+), 51 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 89cf97e76d798..2bcbdf6512389 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -107,6 +107,10 @@ package object dsl { LikeAll(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) def notLikeAll(others: Expression*): Expression = NotLikeAll(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) + def likeAny(others: Expression*): Expression = + LikeAny(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) + def notLikeAny(others: Expression*): Expression = + NotLikeAny(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index b4d9921488d5f..0b94fe8b5d47e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -180,14 +180,12 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } } -/** - * Optimized version of LIKE ALL, when all pattern values are literal. 
- */ -abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { +abstract class MultiLikeBase + extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { protected def patterns: Seq[UTF8String] - protected def isNotLikeAll: Boolean + protected def isNotSpecified: Boolean override def inputTypes: Seq[DataType] = StringType :: Nil @@ -195,27 +193,39 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w override def nullable: Boolean = true - private lazy val hasNull: Boolean = patterns.contains(null) + protected lazy val hasNull: Boolean = patterns.contains(null) - private lazy val cache = patterns.filterNot(_ == null) + protected lazy val cache = patterns.filterNot(_ == null) .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) - private lazy val matchFunc = if (isNotLikeAll) { + protected lazy val matchFunc = if (isNotSpecified) { (p: Pattern, inputValue: String) => !p.matcher(inputValue).matches() } else { (p: Pattern, inputValue: String) => p.matcher(inputValue).matches() } + protected def matches(exprValue: String): Any + override def eval(input: InternalRow): Any = { val exprValue = child.eval(input) if (exprValue == null) { null } else { - if (cache.forall(matchFunc(_, exprValue.toString))) { - if (hasNull) null else true - } else { - false - } + matches(exprValue.toString) + } + } +} + +/** + * Optimized version of LIKE ALL, when all pattern values are literal. + */ +abstract class LikeAllBase extends MultiLikeBase { + + override def matches(exprValue: String): Any = { + if (cache.forall(matchFunc(_, exprValue))) { + if (hasNull) null else true + } else { + false } } @@ -227,7 +237,7 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w val valueArg = ctx.freshName("valueArg") val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) - val checkNotMatchCode = if (isNotLikeAll) { + val checkNotMatchCode = if (isNotSpecified) { s"$pattern.matcher($valueArg.toString()).matches()" } else { s"!$pattern.matcher($valueArg.toString()).matches()" @@ -255,11 +265,67 @@ abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes w } case class LikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { - override def isNotLikeAll: Boolean = false + override def isNotSpecified: Boolean = false } case class NotLikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { - override def isNotLikeAll: Boolean = true + override def isNotSpecified: Boolean = true +} + +/** + * Optimized version of LIKE ANY, when all pattern values are literal. 
+ */ +abstract class LikeAnyBase extends MultiLikeBase { + + override def matches(exprValue: String): Any = { + if (cache.exists(matchFunc(_, exprValue))) { + true + } else { + if (hasNull) null else false + } + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val eval = child.genCode(ctx) + val patternClass = classOf[Pattern].getName + val javaDataType = CodeGenerator.javaType(child.dataType) + val pattern = ctx.freshName("pattern") + val valueArg = ctx.freshName("valueArg") + val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) + + val checkMatchCode = if (isNotSpecified) { + s"!$pattern.matcher($valueArg.toString()).matches()" + } else { + s"$pattern.matcher($valueArg.toString()).matches()" + } + + ev.copy(code = + code""" + |${eval.code} + |boolean ${ev.isNull} = false; + |boolean ${ev.value} = false; + |if (${eval.isNull}) { + | ${ev.isNull} = true; + |} else { + | $javaDataType $valueArg = ${eval.value}; + | for ($patternClass $pattern: $patternCache) { + | if ($checkMatchCode) { + | ${ev.value} = true; + | break; + | } + | } + | if (!${ev.value} && $hasNull) ${ev.isNull} = true; + |} + """.stripMargin) + } +} + +case class LikeAny(child: Expression, patterns: Seq[UTF8String]) extends LikeAnyBase { + override def isNotSpecified: Boolean = false +} + +case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends LikeAnyBase { + override def isNotSpecified: Boolean = true } // scalastyle:off line.contains.tab diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index ff8b56f0b724b..3788e1631c3dd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1396,14 +1396,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg case other => Seq(other) } - def getLikeQuantifierExprs(expressions: java.util.List[ExpressionContext]): Seq[Expression] = { - if (expressions.isEmpty) { - throw new ParseException("Expected something between '(' and ')'.", ctx) - } else { - expressions.asScala.map(expression).map(p => invertIfNotDefined(new Like(e, p))).toSeq - } - } - // Create the predicate. ctx.kind.getType match { case SqlBaseParser.BETWEEN => @@ -1418,12 +1410,24 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg case SqlBaseParser.LIKE => Option(ctx.quantifier).map(_.getType) match { case Some(SqlBaseParser.ANY) | Some(SqlBaseParser.SOME) => - getLikeQuantifierExprs(ctx.expression).reduceLeft(Or) + validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) + val expressions = expressionList(ctx.expression) + if (expressions.forall(_.foldable) && expressions.forall(_.dataType == StringType)) { + // If there are many pattern expressions, will throw StackOverflowError. + // So we use LikeAny or NotLikeAny instead. 
+ val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) + ctx.NOT match { + case null => LikeAny(e, patterns.toSeq) + case _ => NotLikeAny(e, patterns.toSeq) + } + } else { + ctx.expression.asScala.map(expression) + .map(p => invertIfNotDefined(new Like(e, p))).toSeq.reduceLeft(Or) + } case Some(SqlBaseParser.ALL) => validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) - val expressions = ctx.expression.asScala.map(expression) - if (expressions.size > SQLConf.get.optimizerLikeAllConversionThreshold && - expressions.forall(_.foldable) && expressions.forall(_.dataType == StringType)) { + val expressions = expressionList(ctx.expression) + if (expressions.forall(_.foldable) && expressions.forall(_.dataType == StringType)) { // If there are many pattern expressions, will throw StackOverflowError. // So we use LikeAll or NotLikeAll instead. val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) @@ -1432,7 +1436,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg case _ => NotLikeAll(e, patterns.toSeq) } } else { - getLikeQuantifierExprs(ctx.expression).reduceLeft(And) + ctx.expression.asScala.map(expression) + .map(p => invertIfNotDefined(new Like(e, p))).toSeq.reduceLeft(And) } case _ => val escapeChar = Option(ctx.escapeChar).map(string).map { str => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 979ddebc637f0..a1d6f9f608873 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -216,18 +216,6 @@ object SQLConf { "for using switch statements in InSet must be non-negative and less than or equal to 600") .createWithDefault(400) - val OPTIMIZER_LIKE_ALL_CONVERSION_THRESHOLD = - buildConf("spark.sql.optimizer.likeAllConversionThreshold") - .internal() - .doc("Configure the maximum size of the pattern sequence in like all. Spark will convert " + - "the logical combination of like to avoid StackOverflowError. 
200 is an empirical value " + - "that will not cause StackOverflowError.") - .version("3.1.0") - .intConf - .checkValue(threshold => threshold >= 0, "The maximum size of pattern sequence " + - "in like all must be non-negative") - .createWithDefault(200) - val PLAN_CHANGE_LOG_LEVEL = buildConf("spark.sql.planChangeLog.level") .internal() .doc("Configures the log level for logging the change from the original plan to the new " + @@ -3048,8 +3036,6 @@ class SQLConf extends Serializable with Logging { def optimizerInSetSwitchThreshold: Int = getConf(OPTIMIZER_INSET_SWITCH_THRESHOLD) - def optimizerLikeAllConversionThreshold: Int = getConf(OPTIMIZER_LIKE_ALL_CONVERSION_THRESHOLD) - def planChangeLogLevel: String = getConf(PLAN_CHANGE_LOG_LEVEL) def planChangeRules: Option[String] = getConf(PLAN_CHANGE_LOG_RULES) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index cc5ab5dc7b4e0..8d7501d952ecb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -72,6 +72,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { .notLikeAll(Literal.create(null, StringType), "%yoo%"), null) } + test("LIKE ANY") { + checkEvaluation(Literal.create(null, StringType).likeAny("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", StringType).likeAny("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", StringType).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", StringType).likeAny("%fee%", "%bar%"), false) + checkEvaluation(Literal.create("foo", StringType) + .likeAny("%foo%", Literal.create(null, StringType)), true) + checkEvaluation(Literal.create("foo", StringType) + .likeAny(Literal.create(null, StringType), "%foo%"), true) + checkEvaluation(Literal.create("foo", StringType) + .likeAny("%feo%", Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType) + .likeAny(Literal.create(null, StringType), "%feo%"), null) + checkEvaluation(Literal.create("foo", StringType).notLikeAny("tee", "%yoo%"), true) + checkEvaluation(Literal.create("foo", StringType).notLikeAny("%oo%", "%yoo%"), true) + checkEvaluation(Literal.create("foo", StringType).notLikeAny("%foo%", "%oo"), false) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAny("%foo%", Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAny(Literal.create(null, StringType), "%foo%"), null) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAny("%yoo%", Literal.create(null, StringType)), true) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAny(Literal.create(null, StringType), "%yoo%"), true) + } + test("LIKE Pattern") { // null handling diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index b1d0d044eaead..9f6a76b9228c5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -210,13 +210,13 @@ class ExpressionParserSuite extends AnalysisTest { test("(NOT) 
LIKE (ANY | SOME | ALL) expressions") { Seq("any", "some").foreach { quantifier => - assertEqual(s"a like $quantifier ('foo%', 'b%')", ('a like "foo%") || ('a like "b%")) - assertEqual(s"a not like $quantifier ('foo%', 'b%')", !('a like "foo%") || !('a like "b%")) - assertEqual(s"not (a like $quantifier ('foo%', 'b%'))", !(('a like "foo%") || ('a like "b%"))) + assertEqual(s"a like $quantifier ('foo%', 'b%')", 'a likeAny("foo%", "b%")) + assertEqual(s"a not like $quantifier ('foo%', 'b%')", 'a notLikeAny("foo%", "b%")) + assertEqual(s"not (a like $quantifier ('foo%', 'b%'))", !('a likeAny("foo%", "b%"))) } - assertEqual("a like all ('foo%', 'b%')", ('a like "foo%") && ('a like "b%")) - assertEqual("a not like all ('foo%', 'b%')", !('a like "foo%") && !('a like "b%")) - assertEqual("not (a like all ('foo%', 'b%'))", !(('a like "foo%") && ('a like "b%"))) + assertEqual("a like all ('foo%', 'b%')", 'a likeAll("foo%", "b%")) + assertEqual("a not like all ('foo%', 'b%')", 'a notLikeAll("foo%", "b%")) + assertEqual("not (a like all ('foo%', 'b%'))", !('a likeAll("foo%", "b%"))) Seq("any", "some", "all").foreach { quantifier => intercept(s"a like $quantifier()", "Expected something between '(' and ')'") diff --git a/sql/core/src/test/resources/sql-tests/inputs/like-all.sql b/sql/core/src/test/resources/sql-tests/inputs/like-all.sql index f83277376e680..51b689607e8e3 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/like-all.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/like-all.sql @@ -1,6 +1,4 @@ -- test cases for like all ---CONFIG_DIM1 spark.sql.optimizer.likeAllConversionThreshold=0 ---CONFIG_DIM1 spark.sql.optimizer.likeAllConversionThreshold=200 CREATE OR REPLACE TEMPORARY VIEW like_all_table AS SELECT * FROM (VALUES ('google', '%oo%'), diff --git a/sql/core/src/test/resources/sql-tests/inputs/like-any.sql b/sql/core/src/test/resources/sql-tests/inputs/like-any.sql index 5758a2a494944..a6e9827d58d94 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/like-any.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/like-any.sql @@ -1,3 +1,5 @@ +-- test cases for like any + CREATE OR REPLACE TEMPORARY VIEW like_any_table AS SELECT * FROM (VALUES ('google', '%oo%'), ('facebook', '%oo%'), From cf4ad212b100901b7065f2db8c1688c83423141d Mon Sep 17 00:00:00 2001 From: Prakhar Jain Date: Tue, 1 Dec 2020 21:13:27 +0900 Subject: [PATCH 066/150] [SPARK-33503][SQL] Refactor SortOrder class to allow multiple childrens ### What changes were proposed in this pull request? This is a followup of #30302 . As part of this PR, sameOrderExpressions set is made part of children of SortOrder node - so that they don't need any special handling as done in #30302 . ### Why are the changes needed? sameOrderExpressions should get same treatment as child. So making them part of children helps in transforming them easily. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing UTs Closes #30430 from prakharjain09/SPARK-33400-sortorder-refactor. 
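A small sketch of the new shape, using hypothetical attributes, only to show that `sameOrderExpressions` now travel as ordinary children:

```scala
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.IntegerType

val a = AttributeReference("a", IntegerType)()
val b = AttributeReference("b", IntegerType)()   // known to be ordered the same way as a

val order = SortOrder(a, Ascending, NullsFirst, sameOrderExpressions = Seq(b))
order.children                             // Seq(a, b): the sort key plus its equivalent orderings
order.satisfies(SortOrder(b, Ascending))   // true, because b is found among the children
```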
Authored-by: Prakhar Jain Signed-off-by: Takeshi Yamamuro --- .../sql/catalyst/analysis/Analyzer.scala | 2 +- .../spark/sql/catalyst/dsl/package.scala | 4 +-- .../sql/catalyst/expressions/SortOrder.scala | 10 ++++--- .../sql/catalyst/parser/AstBuilder.scala | 2 +- .../scala/org/apache/spark/sql/Column.scala | 8 +++--- .../AliasAwareOutputExpression.scala | 6 +---- .../execution/joins/SortMergeJoinExec.scala | 9 ++++--- .../spark/sql/execution/PlannerSuite.scala | 26 +++++++++++++++++++ 8 files changed, 46 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index abd38f2f9d940..6b06cf13262d4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1822,7 +1822,7 @@ class Analyzer(override val catalogManager: CatalogManager) val newOrders = orders map { case s @ SortOrder(UnresolvedOrdinal(index), direction, nullOrdering, _) => if (index > 0 && index <= child.output.size) { - SortOrder(child.output(index - 1), direction, nullOrdering, Set.empty) + SortOrder(child.output(index - 1), direction, nullOrdering, Seq.empty) } else { s.failAnalysis( s"ORDER BY position $index is not in select list " + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 2bcbdf6512389..5a778d2785a67 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -135,9 +135,9 @@ package object dsl { } def asc: SortOrder = SortOrder(expr, Ascending) - def asc_nullsLast: SortOrder = SortOrder(expr, Ascending, NullsLast, Set.empty) + def asc_nullsLast: SortOrder = SortOrder(expr, Ascending, NullsLast, Seq.empty) def desc: SortOrder = SortOrder(expr, Descending) - def desc_nullsFirst: SortOrder = SortOrder(expr, Descending, NullsFirst, Set.empty) + def desc_nullsFirst: SortOrder = SortOrder(expr, Descending, NullsFirst, Seq.empty) def as(alias: String): NamedExpression = Alias(expr, alias)() def as(alias: Symbol): NamedExpression = Alias(expr, alias.name)() } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala index 54259e713accd..d9923b5d022e0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SortOrder.scala @@ -63,8 +63,10 @@ case class SortOrder( child: Expression, direction: SortDirection, nullOrdering: NullOrdering, - sameOrderExpressions: Set[Expression]) - extends UnaryExpression with Unevaluable { + sameOrderExpressions: Seq[Expression]) + extends Expression with Unevaluable { + + override def children: Seq[Expression] = child +: sameOrderExpressions override def checkInputDataTypes(): TypeCheckResult = { if (RowOrdering.isOrderable(dataType)) { @@ -83,7 +85,7 @@ case class SortOrder( def isAscending: Boolean = direction == Ascending def satisfies(required: SortOrder): Boolean = { - (sameOrderExpressions + child).exists(required.child.semanticEquals) && + children.exists(required.child.semanticEquals) && direction == required.direction && 
nullOrdering == required.nullOrdering } } @@ -92,7 +94,7 @@ object SortOrder { def apply( child: Expression, direction: SortDirection, - sameOrderExpressions: Set[Expression] = Set.empty): SortOrder = { + sameOrderExpressions: Seq[Expression] = Seq.empty): SortOrder = { new SortOrder(child, direction, direction.defaultNullOrdering, sameOrderExpressions) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 3788e1631c3dd..12c5e0de686fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1910,7 +1910,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } else { direction.defaultNullOrdering } - SortOrder(expression(ctx.expression), direction, nullOrdering, Set.empty) + SortOrder(expression(ctx.expression), direction, nullOrdering, Seq.empty) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index b3e403ffa7382..95134d9111593 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -1228,7 +1228,7 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 2.1.0 */ - def desc_nulls_first: Column = withExpr { SortOrder(expr, Descending, NullsFirst, Set.empty) } + def desc_nulls_first: Column = withExpr { SortOrder(expr, Descending, NullsFirst, Seq.empty) } /** * Returns a sort expression based on the descending order of the column, @@ -1244,7 +1244,7 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 2.1.0 */ - def desc_nulls_last: Column = withExpr { SortOrder(expr, Descending, NullsLast, Set.empty) } + def desc_nulls_last: Column = withExpr { SortOrder(expr, Descending, NullsLast, Seq.empty) } /** * Returns a sort expression based on ascending order of the column. @@ -1275,7 +1275,7 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 2.1.0 */ - def asc_nulls_first: Column = withExpr { SortOrder(expr, Ascending, NullsFirst, Set.empty) } + def asc_nulls_first: Column = withExpr { SortOrder(expr, Ascending, NullsFirst, Seq.empty) } /** * Returns a sort expression based on ascending order of the column, @@ -1291,7 +1291,7 @@ class Column(val expr: Expression) extends Logging { * @group expr_ops * @since 2.1.0 */ - def asc_nulls_last: Column = withExpr { SortOrder(expr, Ascending, NullsLast, Set.empty) } + def asc_nulls_last: Column = withExpr { SortOrder(expr, Ascending, NullsLast, Seq.empty) } /** * Prints the expression to the console for debugging purposes. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala index 3ba8745be995f..3cbe1654ea2cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala @@ -65,11 +65,7 @@ trait AliasAwareOutputOrdering extends AliasAwareOutputExpression { final override def outputOrdering: Seq[SortOrder] = { if (hasAlias) { - orderingExpressions.map { sortOrder => - val newSortOrder = normalizeExpression(sortOrder).asInstanceOf[SortOrder] - val newSameOrderExpressions = newSortOrder.sameOrderExpressions.map(normalizeExpression) - newSortOrder.copy(sameOrderExpressions = newSameOrderExpressions) - } + orderingExpressions.map(normalizeExpression(_).asInstanceOf[SortOrder]) } else { orderingExpressions } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 6e59ad07d7168..eabbdc8ed3243 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -68,9 +68,9 @@ case class SortMergeJoinExec( val leftKeyOrdering = getKeyOrdering(leftKeys, left.outputOrdering) val rightKeyOrdering = getKeyOrdering(rightKeys, right.outputOrdering) leftKeyOrdering.zip(rightKeyOrdering).map { case (lKey, rKey) => - // Also add the right key and its `sameOrderExpressions` - SortOrder(lKey.child, Ascending, lKey.sameOrderExpressions + rKey.child ++ rKey - .sameOrderExpressions) + // Also add expressions from right side sort order + val sameOrderExpressions = ExpressionSet(lKey.sameOrderExpressions ++ rKey.children) + SortOrder(lKey.child, Ascending, sameOrderExpressions.toSeq) } // For left and right outer joins, the output is ordered by the streamed input's join keys. 
case LeftOuter => getKeyOrdering(leftKeys, left.outputOrdering) @@ -96,7 +96,8 @@ case class SortMergeJoinExec( val requiredOrdering = requiredOrders(keys) if (SortOrder.orderingSatisfies(childOutputOrdering, requiredOrdering)) { keys.zip(childOutputOrdering).map { case (key, childOrder) => - SortOrder(key, Ascending, childOrder.sameOrderExpressions + childOrder.child - key) + val sameOrderExpressionsSet = ExpressionSet(childOrder.children) - key + SortOrder(key, Ascending, sameOrderExpressionsSet.toSeq) } } else { requiredOrdering diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 6de81cc414d7d..5e30f846307ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -1090,6 +1090,32 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } } + test("sort order doesn't have repeated expressions") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t1", "t2") { + spark.range(10).repartition($"id").createTempView("t1") + spark.range(20).repartition($"id").createTempView("t2") + val planned = sql( + """ + | SELECT t12.id, t1.id + | FROM (SELECT t1.id FROM t1, t2 WHERE t1.id * 2 = t2.id) t12, t1 + | where 2 * t12.id = t1.id + """.stripMargin).queryExecution.executedPlan + + // t12 is already sorted on `t1.id * 2`. and we need to sort it on `2 * t12.id` + // for 2nd join. So sorting on t12 can be avoided + val sortNodes = planned.collect { case s: SortExec => s } + assert(sortNodes.size == 3) + val outputOrdering = planned.outputOrdering + assert(outputOrdering.size == 1) + // Sort order should have 3 childrens, not 4. This is because t1.id*2 and 2*t1.id are same + assert(outputOrdering.head.children.size == 3) + assert(outputOrdering.head.children.count(_.isInstanceOf[AttributeReference]) == 2) + assert(outputOrdering.head.children.count(_.isInstanceOf[Multiply]) == 1) + } + } + } + test("aliases to expressions should not be replaced") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { withTempView("df1", "df2") { From 478fb7f5280d8da2c68b858114eda358708e681b Mon Sep 17 00:00:00 2001 From: Anton Okolnychyi Date: Tue, 1 Dec 2020 14:11:01 +0000 Subject: [PATCH 067/150] [SPARK-33608][SQL] Handle DELETE/UPDATE/MERGE in PullupCorrelatedPredicates ### What changes were proposed in this pull request? This PR adds logic to handle DELETE/UPDATE/MERGE plans in `PullupCorrelatedPredicates`. ### Why are the changes needed? Right now, `PullupCorrelatedPredicates` applies only to filters and unary nodes. As a result, correlated predicates in DELETE/UPDATE/MERGE are not rewritten. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? The PR adds 3 new test cases. Closes #30555 from aokolnychyi/spark-33608. 
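For illustration, the shape of statement this enables; the catalog, table, and column names are invented, and the target is assumed to be a v2 table that supports deletes:

```scala
// Correlated predicates under DELETE/UPDATE/MERGE are now pulled up and rewritten
// just like those under Filter/Project/Aggregate.
spark.sql("""
  DELETE FROM cat.db.target t
  WHERE t.a IN (SELECT c FROM cat.db.source s WHERE s.d = t.b)
""")
```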
Authored-by: Anton Okolnychyi Signed-off-by: Wenchen Fan --- .../sql/catalyst/optimizer/subquery.scala | 2 + .../PullupCorrelatedPredicatesSuite.scala | 64 ++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 11532d22204a4..3c2ee3149d317 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -328,6 +328,8 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper // Only a few unary nodes (Project/Filter/Aggregate) can contain subqueries. case q: UnaryNode => rewriteSubQueries(q, q.children) + case s: SupportsSubquery => + rewriteSubQueries(s, s.children) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala index 17dfc7f3f18f7..ae9a694b50444 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{Assignment, DeleteAction, DeleteFromTable, InsertAction, LocalRelation, LogicalPlan, MergeIntoTable, UpdateTable} import org.apache.spark.sql.catalyst.rules.RuleExecutor class PullupCorrelatedPredicatesSuite extends PlanTest { @@ -98,4 +98,66 @@ class PullupCorrelatedPredicatesSuite extends PlanTest { val doubleOptimized = Optimize.execute(optimized) comparePlans(optimized, doubleOptimized, false) } + + test("PullupCorrelatedPredicates should handle deletes") { + val subPlan = testRelation2.where('a === 'c).select('c) + val cond = InSubquery(Seq('a), ListQuery(subPlan)) + val deletePlan = DeleteFromTable(testRelation, Some(cond)).analyze + assert(deletePlan.resolved) + + val optimized = Optimize.execute(deletePlan) + assert(optimized.resolved) + + optimized match { + case DeleteFromTable(_, Some(s: InSubquery)) => + val outerRefs = SubExprUtils.getOuterReferences(s.query.plan) + assert(outerRefs.isEmpty, "should be no outer refs") + case other => + fail(s"unexpected logical plan: $other") + } + } + + test("PullupCorrelatedPredicates should handle updates") { + val subPlan = testRelation2.where('a === 'c).select('c) + val cond = InSubquery(Seq('a), ListQuery(subPlan)) + val updatePlan = UpdateTable(testRelation, Seq.empty, Some(cond)).analyze + assert(updatePlan.resolved) + + val optimized = Optimize.execute(updatePlan) + assert(optimized.resolved) + + optimized match { + case UpdateTable(_, _, Some(s: InSubquery)) => + val outerRefs = SubExprUtils.getOuterReferences(s.query.plan) + assert(outerRefs.isEmpty, "should be no outer refs") + case other => + fail(s"unexpected logical plan: $other") + } + } + + test("PullupCorrelatedPredicates should handle merge") { + val testRelation3 = LocalRelation('e.int, 'f.double) + val subPlan = 
testRelation3.where('a === 'e).select('e) + val cond = InSubquery(Seq('a), ListQuery(subPlan)) + + val mergePlan = MergeIntoTable( + testRelation, + testRelation2, + cond, + Seq(DeleteAction(None)), + Seq(InsertAction(None, Seq(Assignment('a, 'c), Assignment('b, 'd))))) + val analyzedMergePlan = mergePlan.analyze + assert(analyzedMergePlan.resolved) + + val optimized = Optimize.execute(analyzedMergePlan) + assert(optimized.resolved) + + optimized match { + case MergeIntoTable(_, _, s: InSubquery, _, _) => + val outerRefs = SubExprUtils.getOuterReferences(s.query.plan) + assert(outerRefs.isEmpty, "should be no outer refs") + case other => + fail(s"unexpected logical plan: $other") + } + } } From c24f2b2d6afb411fbfffb90fa87150f3b6912343 Mon Sep 17 00:00:00 2001 From: Anton Okolnychyi Date: Tue, 1 Dec 2020 09:27:46 -0800 Subject: [PATCH 068/150] [SPARK-33612][SQL] Add dataSourceRewriteRules batch to Optimizer ### What changes were proposed in this pull request? This PR adds a new batch to the optimizer for executing rules that rewrite plans for data sources. ### Why are the changes needed? Right now, we have a special place in the optimizer where we construct v2 scans. As time shows, we need more rewrite rules that would be executed after the operator optimization and before any stats-related rules for v2 tables. Not all rules will be specific to reads. One option is to rename the current batch into something more generic but it would require changing quite some places. That's why it seems better to introduce a new batch and use it for all rewrites. The name is generic so that we don't limit ourselves to v2 data sources only. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? The change is trivial and SPARK-23889 will depend on it. Closes #30558 from aokolnychyi/spark-33612. Authored-by: Anton Okolnychyi Signed-off-by: Dongjoon Hyun --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 9 +++++++++ .../spark/sql/internal/BaseSessionStateBuilder.scala | 11 +++++++++++ 2 files changed, 20 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 9eee7c2b914a4..b7c8f775b857f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -185,6 +185,9 @@ abstract class Optimizer(catalogManager: CatalogManager) RemoveLiteralFromGroupExpressions, RemoveRepetitionFromGroupExpressions) :: Nil ++ operatorOptimizationBatch) :+ + // This batch rewrites data source plans and should be run after the operator + // optimization batch and before any batches that depend on stats. + Batch("Data Source Rewrite Rules", Once, dataSourceRewriteRules: _*) :+ // This batch pushes filters and projections into scan nodes. Before this batch, the logical // plan may contain nodes that do not report stats. Anything that uses stats must run after // this batch. @@ -289,6 +292,12 @@ abstract class Optimizer(catalogManager: CatalogManager) */ def earlyScanPushDownRules: Seq[Rule[LogicalPlan]] = Nil + /** + * Override to provide additional rules for rewriting data source plans. Such rules will be + * applied after operator optimization rules and before any rules that depend on stats. 
+ */ + def dataSourceRewriteRules: Seq[Rule[LogicalPlan]] = Nil + /** * Returns (defaultBatches - (excludedRules - nonExcludableRules)), the rule batches that * eventually run in the Optimizer. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index a89a5de3b7e72..8101f9e291b44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -241,6 +241,9 @@ abstract class BaseSessionStateBuilder( override def earlyScanPushDownRules: Seq[Rule[LogicalPlan]] = super.earlyScanPushDownRules ++ customEarlyScanPushDownRules + override def dataSourceRewriteRules: Seq[Rule[LogicalPlan]] = + super.dataSourceRewriteRules ++ customDataSourceRewriteRules + override def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = super.extendedOperatorOptimizationRules ++ customOperatorOptimizationRules } @@ -264,6 +267,14 @@ abstract class BaseSessionStateBuilder( */ protected def customEarlyScanPushDownRules: Seq[Rule[LogicalPlan]] = Nil + /** + * Custom rules for rewriting data source plans to add to the Optimizer. Prefer overriding + * this instead of creating your own Optimizer. + * + * Note that this may NOT depend on the `optimizer` function. + */ + protected def customDataSourceRewriteRules: Seq[Rule[LogicalPlan]] = Nil + /** * Planner that converts optimized logical plans to physical plans. * From 5d0045eedf4b138c031accac2b1fa1e8d6f3f7c6 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Wed, 2 Dec 2020 01:36:41 +0800 Subject: [PATCH 069/150] [SPARK-33611][UI] Avoid encoding twice on the query parameter of rewritten proxy URL ### What changes were proposed in this pull request? When running Spark behind a reverse proxy(e.g. Nginx, Apache HTTP server), the request URL can be encoded twice if we pass the query string directly to the constructor of `java.net.URI`: ``` > val uri = "http://localhost:8081/test" > val query = "order%5B0%5D%5Bcolumn%5D=0" // query string of URL from the reverse proxy > val rewrittenURI = URI.create(uri.toString()) > new URI(rewrittenURI.getScheme(), rewrittenURI.getAuthority(), rewrittenURI.getPath(), query, rewrittenURI.getFragment()).toString result: http://localhost:8081/test?order%255B0%255D%255Bcolumn%255D=0 ``` In Spark's stage page, the URL of "/taskTable" contains query parameter order[0][dir]. After encoding twice, the query parameter becomes `order%255B0%255D%255Bdir%255D` and it will be decoded as `order%5B0%5D%5Bdir%5D` instead of `order[0][dir]`. As a result, there will be NullPointerException from https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala#L176 Other than that, the other parameter may not work as expected after encoded twice. This PR is to fix the bug by calling the method `URI.create(String URL)` directly. This convenience method can avoid encoding twice on the query parameter. ``` > val uri = "http://localhost:8081/test" > val query = "order%5B0%5D%5Bcolumn%5D=0" > URI.create(s"$uri?$query").toString result: http://localhost:8081/test?order%5B0%5D%5Bcolumn%5D=0 > URI.create(s"$uri?$query").getQuery result: order[0][column]=0 ``` ### Why are the changes needed? Fix a potential bug when Spark's reverse proxy is enabled. The bug itself is similar to https://github.com/apache/spark/pull/29271. 
### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Add a new unit test. Also, Manual UI testing for master, worker and app UI with an nginx proxy Spark config: ``` spark.ui.port 8080 spark.ui.reverseProxy=true spark.ui.reverseProxyUrl=/path/to/spark/ ``` nginx config: ``` server { listen 9000; set $SPARK_MASTER http://127.0.0.1:8080; # split spark UI path into prefix and local path within master UI location ~ ^(/path/to/spark/) { # strip prefix when forwarding request rewrite /path/to/spark(/.*) $1 break; #rewrite /path/to/spark/ "/" ; # forward to spark master UI proxy_pass $SPARK_MASTER; proxy_intercept_errors on; error_page 301 302 307 = handle_redirects; } location handle_redirects { set $saved_redirect_location '$upstream_http_location'; proxy_pass $saved_redirect_location; } } ``` Closes #30552 from gengliangwang/decodeProxyRedirect. Authored-by: Gengliang Wang Signed-off-by: Gengliang Wang --- .../scala/org/apache/spark/ui/JettyUtils.scala | 16 ++++++---------- .../test/scala/org/apache/spark/ui/UISuite.scala | 9 +++++++++ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index 2a3597e323543..663da0d33e20b 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -401,17 +401,13 @@ private[spark] object JettyUtils extends Logging { uri.append(rest) } - val rewrittenURI = URI.create(uri.toString()) - if (query != null) { - return new URI( - rewrittenURI.getScheme(), - rewrittenURI.getAuthority(), - rewrittenURI.getPath(), - query, - rewrittenURI.getFragment() - ).normalize() + val queryString = if (query == null) { + "" + } else { + s"?$query" } - rewrittenURI.normalize() + // SPARK-33611: use method `URI.create` to avoid percent-encoding twice on the query string. + URI.create(uri.toString() + queryString).normalize() } def createProxyLocationHeader( diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala index 56026eaa0072b..c7e1dfe71d563 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala @@ -216,6 +216,15 @@ class UISuite extends SparkFunSuite { assert(rewrittenURI === null) } + test("SPARK-33611: Avoid encoding twice on the query parameter of proxy rewrittenURI") { + val prefix = "/worker-id" + val target = "http://localhost:8081" + val path = "/worker-id/json" + val rewrittenURI = + JettyUtils.createProxyURI(prefix, target, path, "order%5B0%5D%5Bcolumn%5D=0") + assert(rewrittenURI.toString === "http://localhost:8081/json?order%5B0%5D%5Bcolumn%5D=0") + } + test("verify rewriting location header for reverse proxy") { val clientRequest = mock(classOf[HttpServletRequest]) var headerValue = "http://localhost:4040/jobs" From 5a1c5ac8073ab46c145146485c71cc6aceb8c5b8 Mon Sep 17 00:00:00 2001 From: zero323 Date: Tue, 1 Dec 2020 10:44:14 -0800 Subject: [PATCH 070/150] [SPARK-33622][R][ML] Add array_to_vector to SparkR ### What changes were proposed in this pull request? This PR adds `array_to_vector` to R API. ### Why are the changes needed? Feature parity. ### Does this PR introduce _any_ user-facing change? New function exposed in the public API. ### How was this patch tested? New unit test. Manual verification of the documentation examples. Closes #30561 from zero323/SPARK-33622. 
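For context, the JVM function exposed here (invoked via `callJStatic` in the R code below) can also be used directly from Scala; a minimal sketch mirroring the roxygen example, with the sample data path taken from those docs:

```scala
import org.apache.spark.ml.functions.{array_to_vector, vector_to_array}
import org.apache.spark.sql.functions.col

val df = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
df.withColumn("array", vector_to_array(col("features")))   // MLlib vector -> array<double>
  .withColumn("vector", array_to_vector(col("array")))     // back to a dense MLlib vector
  .show(1)
```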
Authored-by: zero323 Signed-off-by: Dongjoon Hyun --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 26 +++++++++++++++++++++++++- R/pkg/R/generics.R | 4 ++++ R/pkg/tests/fulltests/test_sparkSQL.R | 3 ++- 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 91f6e6dc8a0e6..6ef2df5731e10 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -223,6 +223,7 @@ exportMethods("%<=>%", "array_remove", "array_repeat", "array_sort", + "array_to_vector", "array_transform", "arrays_overlap", "array_union", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 99406443165d5..58d07a8d8fc2f 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -357,7 +357,13 @@ NULL #' @examples #' \dontrun{ #' df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm") -#' head(select(df, vector_to_array(df$features))) +#' head( +#' withColumn( +#' withColumn(df, "array", vector_to_array(df$features)), +#' "vector", +#' array_to_vector(column("array")) +#' ) +#' ) #' } NULL @@ -4609,6 +4615,24 @@ setMethod("timestamp_seconds", column(jc) }) +#' @details +#' \code{array_to_vector} Converts a column of array of numeric type into +#' a column of dense vectors in MLlib +#' +#' @rdname column_ml_functions +#' @aliases array_to_vector array_to_vector,Column-method +#' @note array_to_vector since 3.1.0 +setMethod("array_to_vector", + signature(x = "Column"), + function(x) { + jc <- callJStatic( + "org.apache.spark.ml.functions", + "array_to_vector", + x@jc + ) + column(jc) + }) + #' @details #' \code{vector_to_array} Converts a column of MLlib sparse/dense vectors into #' a column of dense arrays. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 1fe6599bf1b97..fb830aa686f72 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -838,6 +838,10 @@ setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat") #' @name NULL setGeneric("array_sort", function(x) { standardGeneric("array_sort") }) +#' @rdname column_ml_functions +#' @name NULL +setGeneric("array_to_vector", function(x) { standardGeneric("array_to_vector") }) + #' @rdname column_collection_functions #' @name NULL setGeneric("array_transform", function(x, f) { standardGeneric("array_transform") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 833f77786c80b..c623f534f706c 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1425,7 +1425,8 @@ test_that("column functions", { c25 <- overlay(c1, c2, c3, c3) + overlay(c1, c2, c3) + overlay(c1, c2, 1) + overlay(c1, c2, 3, 4) c26 <- timestamp_seconds(c1) + vector_to_array(c) + - vector_to_array(c, "float32") + vector_to_array(c, "float64") + vector_to_array(c, "float32") + vector_to_array(c, "float64") + + array_to_vector(c) c27 <- nth_value("x", 1L) + nth_value("y", 2, TRUE) + nth_value(column("v"), 3) + nth_value(column("z"), 4L, FALSE) c28 <- asc_nulls_first(c1) + asc_nulls_last(c1) + From f71f34572d5510e50953ccd0191c833962b63a32 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 2 Dec 2020 09:50:02 +0900 Subject: [PATCH 071/150] [SPARK-33544][SQL] Optimize size of CreateArray/CreateMap to be the size of its children ### What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-32295 added in an optimization to insert a filter for not null and size > 0 when using inner explode/inline. 
This is fine in most cases but the extra filter is not needed if the explode is with a create array and not using Literals (it already handles LIterals). When this happens you know that the values aren't null and it has a size. It already handles the empty array. The not null check is already optimized out because Createarray and createMap are not nullable, that leaves the size > 0 check. To handle that this PR makes it so that the size > 0 check gets optimized in ConstantFolding to be the size of the children in the array or map. That makes it a literal and then makes it ultimately be optimized out. ### Why are the changes needed? remove unneeded filter ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? Unit tests added and manually tested various cases Closes #30504 from tgravescs/SPARK-33544. Lead-authored-by: Thomas Graves Co-authored-by: Thomas Graves Co-authored-by: Hyukjin Kwon Signed-off-by: HyukjinKwon --- .../expressions/complexTypeCreator.scala | 12 ++++-- .../sql/catalyst/optimizer/expressions.scala | 13 ++++++ .../optimizer/ConstantFoldingSuite.scala | 36 ++++++++++++++++ .../InferFiltersFromGenerateSuite.scala | 41 ++++++++++++++++++- 4 files changed, 98 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 3958cfd0af2a3..f0f92e2d935f1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -30,6 +30,12 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String +/** + * Trait to indicate the expression doesn't have any side effects. This can be used + * to indicate its ok to optimize it out under certain circumstances. + */ +trait NoSideEffect + /** * Returns an Array containing the evaluation of all children expressions. 
*/ @@ -42,7 +48,7 @@ import org.apache.spark.unsafe.types.UTF8String """, since = "1.1.0") case class CreateArray(children: Seq[Expression], useStringTypeWhenEmpty: Boolean) - extends Expression { + extends Expression with NoSideEffect { def this(children: Seq[Expression]) = { this(children, SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) @@ -160,7 +166,7 @@ private [sql] object GenArrayData { """, since = "2.0.0") case class CreateMap(children: Seq[Expression], useStringTypeWhenEmpty: Boolean) - extends Expression { + extends Expression with NoSideEffect{ def this(children: Seq[Expression]) = { this(children, SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) @@ -379,7 +385,7 @@ object CreateStruct { """, since = "1.5.0") // scalastyle:on line.size.limit -case class CreateNamedStruct(children: Seq[Expression]) extends Expression { +case class CreateNamedStruct(children: Seq[Expression]) extends Expression with NoSideEffect { lazy val (nameExprs, valExprs) = children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index d1eb3b07d3d5f..4725f49340451 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -41,6 +41,14 @@ import org.apache.spark.sql.types._ * equivalent [[Literal]] values. */ object ConstantFolding extends Rule[LogicalPlan] { + + private def hasNoSideEffect(e: Expression): Boolean = e match { + case _: Attribute => true + case _: Literal => true + case _: NoSideEffect => e.children.forall(hasNoSideEffect) + case _ => false + } + def apply(plan: LogicalPlan): LogicalPlan = plan transform { case q: LogicalPlan => q transformExpressionsDown { // Skip redundant folding of literals. This rule is technically not necessary. Placing this @@ -48,6 +56,11 @@ object ConstantFolding extends Rule[LogicalPlan] { // object and running eval unnecessarily. case l: Literal => l + case Size(c: CreateArray, _) if c.children.forall(hasNoSideEffect) => + Literal(c.children.length) + case Size(c: CreateMap, _) if c.children.forall(hasNoSideEffect) => + Literal(c.children.length / 2) + // Fold expressions that are foldable. 
case e if e.foldable => Literal.create(e.eval(EmptyRow), e.dataType) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index 23ab6b2df3e64..fd9b58a7a06aa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -263,4 +263,40 @@ class ConstantFoldingSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + + test("SPARK-33544: Constant folding test with sideaffects") { + val originalQuery = + testRelation + .select('a) + .where(Size(CreateArray(Seq(AssertTrue(false)))) > 0) + + val optimized = Optimize.execute(originalQuery.analyze) + comparePlans(optimized, originalQuery.analyze) + } + + object OptimizeForCreate extends RuleExecutor[LogicalPlan] { + val batches = + Batch("AnalysisNodes", Once, + EliminateSubqueryAliases) :: + Batch("ConstantFolding", FixedPoint(4), + OptimizeIn, + ConstantFolding, + PruneFilters) :: Nil + } + + test("SPARK-33544: Constant folding test CreateArray") { + val originalQuery = + testRelation + .select('a) + .where(Size(CreateArray(Seq('a))) > 0) + + val optimized = OptimizeForCreate.execute(originalQuery.analyze) + + val correctAnswer = + testRelation + .select('a) + .analyze + + comparePlans(optimized, correctAnswer) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala index 3f83971aa9821..c6fa1bd6e415c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.optimizer +import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ @@ -33,7 +34,7 @@ class InferFiltersFromGenerateSuite extends PlanTest { val testRelation = LocalRelation('a.array(StructType(Seq( StructField("x", IntegerType), StructField("y", IntegerType) - )))) + ))), 'c1.string, 'c2.string) Seq(Explode(_), PosExplode(_), Inline(_)).foreach { f => val generator = f('a) @@ -72,4 +73,42 @@ class InferFiltersFromGenerateSuite extends PlanTest { comparePlans(optimized, originalQuery) } } + + // setup rules to test inferFilters with ConstantFolding to make sure + // the Filter rule added in inferFilters is removed again when doing + // explode with CreateArray/CreateMap + object OptimizeInferAndConstantFold extends RuleExecutor[LogicalPlan] { + val batches = + Batch("AnalysisNodes", Once, + EliminateSubqueryAliases) :: + Batch("Infer Filters", Once, InferFiltersFromGenerate) :: + Batch("ConstantFolding after", FixedPoint(4), + ConstantFolding, + NullPropagation, + PruneFilters) :: Nil + } + + Seq(Explode(_), PosExplode(_)).foreach { f => + val createArrayExplode = f(CreateArray(Seq('c1))) + test("Don't infer filters from CreateArray " + createArrayExplode) { + val originalQuery = testRelation.generate(createArrayExplode).analyze + val optimized = OptimizeInferAndConstantFold.execute(originalQuery) + comparePlans(optimized, 
originalQuery) + } + val createMapExplode = f(CreateMap(Seq('c1, 'c2))) + test("Don't infer filters from CreateMap " + createMapExplode) { + val originalQuery = testRelation.generate(createMapExplode).analyze + val optimized = OptimizeInferAndConstantFold.execute(originalQuery) + comparePlans(optimized, originalQuery) + } + } + + Seq(Inline(_)).foreach { f => + val createArrayStructExplode = f(CreateArray(Seq(CreateStruct(Seq('c1))))) + test("Don't infer filters from CreateArray " + createArrayStructExplode) { + val originalQuery = testRelation.generate(createArrayStructExplode).analyze + val optimized = OptimizeInferAndConstantFold.execute(originalQuery) + comparePlans(optimized, originalQuery) + } + } }

From 51ebcd95a5f7e377245f302a91e90f9b3db9953e Mon Sep 17 00:00:00 2001
From: Cheng Su
Date: Wed, 2 Dec 2020 10:17:00 +0900
Subject: [PATCH 072/150] [SPARK-32863][SS] Full outer stream-stream join

### What changes were proposed in this pull request?

This PR adds full outer stream-stream join. The implementation of full outer join is:
* For a left side input row, check if there's a match in the right side state store.
  * If there's a match, output the joined row; otherwise output nothing. Put the row in the left side state store.
* For a right side input row, check if there's a match in the left side state store.
  * If there's a match, output the joined row; otherwise output nothing. Put the row in the right side state store.
* State store eviction: evict rows from the left/right side state store below the watermark, and output rows never matched before (a combination of left outer and right outer join).

### Why are the changes needed?

Enable more use cases for Spark stream-stream join.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added unit tests in `UnsupportedOperationsSuite.scala` and `StreamingJoinSuite.scala`.

Closes #30395 from c21/stream-foj.
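For illustration, here is a minimal sketch of the kind of query this change enables (a stream-stream full outer join in Append mode). The `rate` source, column names, and watermark/range values below are illustrative assumptions, not taken from this patch:

```scala
import org.apache.spark.sql.functions.expr

// `spark` is an existing SparkSession (e.g. in spark-shell).
// Two streaming inputs, each with an event-time watermark.
val left = spark.readStream.format("rate").load()
  .selectExpr("value AS leftKey", "timestamp AS leftTime")
  .withWatermark("leftTime", "10 seconds")
val right = spark.readStream.format("rate").load()
  .selectExpr("value AS rightKey", "timestamp AS rightTime")
  .withWatermark("rightTime", "10 seconds")

// Full outer stream-stream join with an event-time range condition; rows that
// never find a match should be emitted with nulls once the watermark passes them.
val joined = left.join(
  right,
  expr("leftKey = rightKey AND rightTime BETWEEN leftTime AND leftTime + interval 5 seconds"),
  "full_outer")
```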
Authored-by: Cheng Su Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../UnsupportedOperationChecker.scala | 71 +++--- .../analysis/UnsupportedOperationsSuite.scala | 16 +- .../StreamingSymmetricHashJoinExec.scala | 57 +++-- .../sql/streaming/StreamingJoinSuite.scala | 209 +++++++++++++++++- 4 files changed, 297 insertions(+), 56 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index 7dcc6a81b48cd..ab7d90098bfd3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -287,7 +287,7 @@ object UnsupportedOperationChecker extends Logging { throwError("dropDuplicates is not supported after aggregation on a " + "streaming DataFrame/Dataset") - case Join(left, right, joinType, condition, _) => + case j @ Join(left, right, joinType, condition, _) => if (left.isStreaming && right.isStreaming && outputMode != InternalOutputModes.Append) { throwError("Join between two streaming DataFrames/Datasets is not supported" + s" in ${outputMode} output mode, only in Append output mode") @@ -298,8 +298,14 @@ object UnsupportedOperationChecker extends Logging { // no further validations needed case FullOuter => - if (left.isStreaming || right.isStreaming) { - throwError("Full outer joins with streaming DataFrames/Datasets are not supported") + if (left.isStreaming && !right.isStreaming) { + throwError("FullOuter joins with streaming DataFrames/Datasets on the left " + + "and a static DataFrame/Dataset on the right is not supported") + } else if (!left.isStreaming && right.isStreaming) { + throwError("FullOuter joins with streaming DataFrames/Datasets on the right " + + "and a static DataFrame/Dataset on the left is not supported") + } else if (left.isStreaming && right.isStreaming) { + checkForStreamStreamJoinWatermark(j) } case LeftAnti => @@ -315,40 +321,17 @@ object UnsupportedOperationChecker extends Logging { throwError(s"$joinType join with a streaming DataFrame/Dataset " + "on the right and a static DataFrame/Dataset on the left is not supported") } else if (left.isStreaming && right.isStreaming) { - val watermarkInJoinKeys = StreamingJoinHelper.isWatermarkInJoinKeys(subPlan) - - val hasValidWatermarkRange = - StreamingJoinHelper.getStateValueWatermark( - left.outputSet, right.outputSet, condition, Some(1000000)).isDefined - - if (!watermarkInJoinKeys && !hasValidWatermarkRange) { - throwError( - s"Stream-stream $joinType join between two streaming DataFrame/Datasets " + - "is not supported without a watermark in the join keys, or a watermark on " + - "the nullable side and an appropriate range condition") - } + checkForStreamStreamJoinWatermark(j) } // We support streaming right outer joins with static on the left always, and with // stream on both sides under the appropriate conditions. 
case RightOuter => if (left.isStreaming && !right.isStreaming) { - throwError("Right outer join with a streaming DataFrame/Dataset on the left and " + + throwError("RightOuter join with a streaming DataFrame/Dataset on the left and " + "a static DataFrame/DataSet on the right not supported") } else if (left.isStreaming && right.isStreaming) { - val isWatermarkInJoinKeys = StreamingJoinHelper.isWatermarkInJoinKeys(subPlan) - - // Check if the nullable side has a watermark, and there's a range condition which - // implies a state value watermark on the first side. - val hasValidWatermarkRange = - StreamingJoinHelper.getStateValueWatermark( - right.outputSet, left.outputSet, condition, Some(1000000)).isDefined - - if (!isWatermarkInJoinKeys && !hasValidWatermarkRange) { - throwError("Stream-stream outer join between two streaming DataFrame/Datasets " + - "is not supported without a watermark in the join keys, or a watermark on " + - "the nullable side and an appropriate range condition") - } + checkForStreamStreamJoinWatermark(j) } case NaturalJoin(_) | UsingJoin(_, _) => @@ -438,4 +421,34 @@ object UnsupportedOperationChecker extends Logging { throw new AnalysisException( msg, operator.origin.line, operator.origin.startPosition, Some(operator)) } + + private def checkForStreamStreamJoinWatermark(join: Join): Unit = { + val watermarkInJoinKeys = StreamingJoinHelper.isWatermarkInJoinKeys(join) + + // Check if the nullable side has a watermark, and there's a range condition which + // implies a state value watermark on the first side. + val hasValidWatermarkRange = join.joinType match { + case LeftOuter | LeftSemi => StreamingJoinHelper.getStateValueWatermark( + join.left.outputSet, join.right.outputSet, join.condition, Some(1000000)).isDefined + case RightOuter => StreamingJoinHelper.getStateValueWatermark( + join.right.outputSet, join.left.outputSet, join.condition, Some(1000000)).isDefined + case FullOuter => + Seq((join.left.outputSet, join.right.outputSet), + (join.right.outputSet, join.left.outputSet)).exists { + case (attributesToFindStateWatermarkFor, attributesWithEventWatermark) => + StreamingJoinHelper.getStateValueWatermark(attributesToFindStateWatermarkFor, + attributesWithEventWatermark, join.condition, Some(1000000)).isDefined + } + case _ => + throwError( + s"Join type ${join.joinType} is not supported with streaming DataFrame/Dataset")(join) + } + + if (!watermarkInJoinKeys && !hasValidWatermarkRange) { + throwError( + s"Stream-stream ${join.joinType} join between two streaming DataFrame/Datasets " + + "is not supported without a watermark in the join keys, or a watermark on " + + "the nullable side and an appropriate range condition")(join) + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala index 3be417de472c6..cdc3f4275414c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala @@ -408,13 +408,15 @@ class UnsupportedOperationsSuite extends SparkFunSuite with SQLHelper { streamStreamSupported = false, expectedMsg = "is not supported in Update output mode") - // Full outer joins: only batch-batch is allowed + // Full outer joins: stream-batch/batch-stream join are not allowed, + // and stream-stream join is allowed 'conditionally' - see below 
check testBinaryOperationInStreamingPlan( - "full outer join", + "FullOuter join", _.join(_, joinType = FullOuter), streamStreamSupported = false, batchStreamSupported = false, - streamBatchSupported = false) + streamBatchSupported = false, + expectedMsg = "FullOuter join") // Left outer, left semi, left anti join: *-stream not allowed Seq((LeftOuter, "LeftOuter join"), (LeftSemi, "LeftSemi join"), (LeftAnti, "LeftAnti join")) @@ -429,14 +431,14 @@ class UnsupportedOperationsSuite extends SparkFunSuite with SQLHelper { // Right outer joins: stream-* not allowed testBinaryOperationInStreamingPlan( - "right outer join", + "RightOuter join", _.join(_, joinType = RightOuter), streamBatchSupported = false, streamStreamSupported = false, - expectedMsg = "outer join") + expectedMsg = "RightOuter join") - // Left outer, right outer, left semi joins - Seq(LeftOuter, RightOuter, LeftSemi).foreach { joinType => + // Left outer, right outer, full outer, left semi joins + Seq(LeftOuter, RightOuter, FullOuter, LeftSemi).foreach { joinType => // Update mode not allowed assertNotSupportedInStreamingPlan( s"$joinType join with stream-stream relations and update mode", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index 8b69205530769..73d2f826f1126 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -165,8 +165,14 @@ case class StreamingSymmetricHashJoinExec( throw new IllegalArgumentException(errorMessageForJoinType) } + private def throwBadStateFormatVersionException(): Nothing = { + throw new IllegalStateException("Unexpected state format version! " + + s"version $stateFormatVersion") + } + require( - joinType == Inner || joinType == LeftOuter || joinType == RightOuter || joinType == LeftSemi, + joinType == Inner || joinType == LeftOuter || joinType == RightOuter || joinType == FullOuter || + joinType == LeftSemi, errorMessageForJoinType) require(leftKeys.map(_.dataType) == rightKeys.map(_.dataType)) @@ -186,6 +192,7 @@ case class StreamingSymmetricHashJoinExec( case _: InnerLike => left.output ++ right.output case LeftOuter => left.output ++ right.output.map(_.withNullability(true)) case RightOuter => left.output.map(_.withNullability(true)) ++ right.output + case FullOuter => (left.output ++ right.output).map(_.withNullability(true)) case LeftSemi => left.output case _ => throwBadJoinTypeException() } @@ -195,6 +202,7 @@ case class StreamingSymmetricHashJoinExec( PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) case LeftOuter => left.outputPartitioning case RightOuter => right.outputPartitioning + case FullOuter => UnknownPartitioning(left.outputPartitioning.numPartitions) case LeftSemi => left.outputPartitioning case _ => throwBadJoinTypeException() } @@ -250,14 +258,14 @@ case class StreamingSymmetricHashJoinExec( // Join one side input using the other side's buffered/state rows. Here is how it is done. // // - `leftSideJoiner.storeAndJoinWithOtherSide(rightSideJoiner)` - // - Inner, Left Outer, Right Outer Join: generates all rows from matching new left input - // with stored right input, and also stores all the left input. 
+ // - Inner, Left Outer, Right Outer, Full Outer Join: generates all rows from matching + // new left input with stored right input, and also stores all the left input. // - Left Semi Join: generates all new left input rows from matching new left input with // stored right input, and also stores all the non-matched left input. // // - `rightSideJoiner.storeAndJoinWithOtherSide(leftSideJoiner)` - // - Inner, Left Outer, Right Outer Join: generates all rows from matching new right input - // with stored left input, and also stores all the right input. + // - Inner, Left Outer, Right Outer, Full Outer Join: generates all rows from matching + // new right input with stored left input, and also stores all the right input. // It also generates all rows from matching new left input with new right input, since // the new left input has become stored by that point. This tiny asymmetry is necessary // to avoid duplication. @@ -314,9 +322,7 @@ case class StreamingSymmetricHashJoinExec( stateFormatVersion match { case 1 => matchesWithRightSideState(new UnsafeRowPair(kv.key, kv.value)) case 2 => kv.matched - case _ => - throw new IllegalStateException("Unexpected state format version! " + - s"version $stateFormatVersion") + case _ => throwBadStateFormatVersionException() } }.map(pair => joinedRow.withLeft(pair.value).withRight(nullRight)) @@ -333,13 +339,23 @@ case class StreamingSymmetricHashJoinExec( stateFormatVersion match { case 1 => matchesWithLeftSideState(new UnsafeRowPair(kv.key, kv.value)) case 2 => kv.matched - case _ => - throw new IllegalStateException("Unexpected state format version! " + - s"version $stateFormatVersion") + case _ => throwBadStateFormatVersionException() } }.map(pair => joinedRow.withLeft(nullLeft).withRight(pair.value)) hashJoinOutputIter ++ outerOutputIter + case FullOuter => + lazy val isKeyToValuePairMatched = (kv: KeyToValuePair) => + stateFormatVersion match { + case 2 => kv.matched + case _ => throwBadStateFormatVersionException() + } + val leftSideOutputIter = leftSideJoiner.removeOldState().filterNot( + isKeyToValuePairMatched).map(pair => joinedRow.withLeft(pair.value).withRight(nullRight)) + val rightSideOutputIter = rightSideJoiner.removeOldState().filterNot( + isKeyToValuePairMatched).map(pair => joinedRow.withLeft(nullLeft).withRight(pair.value)) + + hashJoinOutputIter ++ leftSideOutputIter ++ rightSideOutputIter case _ => throwBadJoinTypeException() } @@ -372,16 +388,21 @@ case class StreamingSymmetricHashJoinExec( // For inner and left semi joins, we have to remove unnecessary state rows from both sides // if possible. // - // For outer joins, we have already removed unnecessary state rows from the outer side - // (e.g., left side for left outer join) while generating the outer "null" outputs. Now, we - // have to remove unnecessary state rows from the other side (e.g., right side for the left - // outer join) if possible. In all cases, nothing needs to be outputted, hence the removal - // needs to be done greedily by immediately consuming the returned iterator. + // For left outer and right outer joins, we have already removed unnecessary state rows from + // the outer side (e.g., left side for left outer join) while generating the outer "null" + // outputs. Now, we have to remove unnecessary state rows from the other side (e.g., right + // side for the left outer join) if possible. In all cases, nothing needs to be outputted, + // hence the removal needs to be done greedily by immediately consuming the returned + // iterator. 
+ // + // For full outer joins, we have already removed unnecessary states from both sides, so + // nothing needs to be outputted here. val cleanupIter = joinType match { case Inner | LeftSemi => leftSideJoiner.removeOldState() ++ rightSideJoiner.removeOldState() case LeftOuter => rightSideJoiner.removeOldState() case RightOuter => leftSideJoiner.removeOldState() + case FullOuter => Iterator.empty case _ => throwBadJoinTypeException() } while (cleanupIter.hasNext) { @@ -491,9 +512,9 @@ case class StreamingSymmetricHashJoinExec( } val generateFilteredJoinedRow: InternalRow => Iterator[InternalRow] = joinSide match { - case LeftSide if joinType == LeftOuter => + case LeftSide if joinType == LeftOuter || joinType == FullOuter => (row: InternalRow) => Iterator(generateJoinedRow(row, nullRight)) - case RightSide if joinType == RightOuter => + case RightSide if joinType == RightOuter || joinType == FullOuter => (row: InternalRow) => Iterator(generateJoinedRow(row, nullLeft)) case _ => (_: InternalRow) => Iterator.empty } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index a25616af360b1..476abcbf5c241 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -99,7 +99,8 @@ abstract class StreamingJoinSuite } else if (joinType == "right_outer") { joined.select(right("key"), right("window.end").cast("long"), 'leftValue, 'rightValue) } else { - joined + joined.select(left("key"), left("window.end").cast("long"), 'leftValue, + right("key"), right("window.end").cast("long"), 'rightValue) } (leftInput, rightInput, select) @@ -128,7 +129,8 @@ abstract class StreamingJoinSuite } else if (joinType == "right_outer") { joined.select(right("key"), right("window.end").cast("long"), 'leftValue, 'rightValue) } else { - joined + joined.select(left("key"), left("window.end").cast("long"), 'leftValue, + right("key"), right("window.end").cast("long"), 'rightValue) } (leftInput, rightInput, select) @@ -1070,6 +1072,209 @@ class StreamingOuterJoinSuite extends StreamingJoinSuite { } } +class StreamingFullOuterJoinSuite extends StreamingJoinSuite { + + test("windowed full outer join") { + val (leftInput, rightInput, joined) = setupWindowedJoin("full_outer") + + testStream(joined)( + MultiAddData(leftInput, 1, 2, 3, 4, 5)(rightInput, 3, 4, 5, 6, 7), + CheckNewAnswer(Row(3, 10, 6, 9), Row(4, 10, 8, 12), Row(5, 10, 10, 15)), + // states + // left: 1, 2, 3, 4 ,5 + // right: 3, 4, 5, 6, 7 + assertNumStateRows(total = 10, updated = 10), + MultiAddData(leftInput, 21)(rightInput, 22), + // Watermark = 11, should remove rows having window=[0,10]. + CheckNewAnswer(Row(1, 10, 2, null), Row(2, 10, 4, null), Row(6, 10, null, 18), + Row(7, 10, null, 21)), + // states + // left: 21 + // right: 22 + // + // states evicted + // left: 1, 2, 3, 4 ,5 (below watermark) + // right: 3, 4, 5, 6, 7 (below watermark) + assertNumStateRows(total = 2, updated = 2), + AddData(leftInput, 22), + CheckNewAnswer(Row(22, 30, 44, 66)), + // states + // left: 21, 22 + // right: 22 + assertNumStateRows(total = 3, updated = 1), + StopStream, + StartStream(), + + AddData(leftInput, 1), + // Row not add as 1 < state key watermark = 12. 
+ CheckNewAnswer(), + // states + // left: 21, 22 + // right: 22 + assertNumStateRows(total = 3, updated = 0, droppedByWatermark = 1), + AddData(rightInput, 5), + // Row not add as 5 < state key watermark = 12. + CheckNewAnswer(), + // states + // left: 21, 22 + // right: 22 + assertNumStateRows(total = 3, updated = 0, droppedByWatermark = 1) + ) + } + + test("full outer early state exclusion on left") { + val (leftInput, rightInput, joined) = setupWindowedJoinWithLeftCondition("full_outer") + + testStream(joined)( + MultiAddData(leftInput, 1, 2, 3)(rightInput, 3, 4, 5), + // The left rows with leftValue <= 4 should generate their outer join rows now and + // not get added to the state. + CheckNewAnswer(Row(1, 10, 2, null, null, null), Row(2, 10, 4, null, null, null), + Row(3, 10, 6, 3, 10, "9")), + // states + // left: 3 + // right: 3, 4, 5 + assertNumStateRows(total = 4, updated = 4), + // Generate outer join result for all non-matched rows when the watermark advances. + MultiAddData(leftInput, 20)(rightInput, 21), + CheckNewAnswer(Row(null, null, null, 4, 10, "12"), Row(null, null, null, 5, 10, "15")), + // states + // left: 20 + // right: 21 + // + // states evicted + // left: 3 (below watermark) + // right: 3, 4, 5 (below watermark) + assertNumStateRows(total = 2, updated = 2), + AddData(rightInput, 20), + CheckNewAnswer(Row(20, 30, 40, 20, 30, "60")), + // states + // left: 20 + // right: 21, 20 + assertNumStateRows(total = 3, updated = 1) + ) + } + + test("full outer early state exclusion on right") { + val (leftInput, rightInput, joined) = setupWindowedJoinWithRightCondition("full_outer") + + testStream(joined)( + MultiAddData(leftInput, 3, 4, 5)(rightInput, 1, 2, 3), + // The right rows with rightValue <= 7 should generate their outer join rows now, + // and never be added to the state. + // The right row with rightValue = 9 > 7, hence joined and added to state. + CheckNewAnswer(Row(null, null, null, 1, 10, "3"), Row(null, null, null, 2, 10, "6"), + Row(3, 10, 6, 3, 10, "9")), + // states + // left: 3, 4, 5 + // right: 3 + assertNumStateRows(total = 4, updated = 4), + // Generate outer join result for all non-matched rows when the watermark advances. + MultiAddData(leftInput, 20)(rightInput, 21), + CheckNewAnswer(Row(4, 10, 8, null, null, null), Row(5, 10, 10, null, null, null)), + // states + // left: 20 + // right: 21 + // + // states evicted + // left: 3, 4, 5 (below watermark) + // right: 3 (below watermark) + assertNumStateRows(total = 2, updated = 2), + AddData(rightInput, 20), + CheckNewAnswer(Row(20, 30, 40, 20, 30, "60")), + // states + // left: 20 + // right: 21, 20 + assertNumStateRows(total = 3, updated = 1) + ) + } + + test("full outer join with watermark range condition") { + val (leftInput, rightInput, joined) = setupWindowedJoinWithRangeCondition("full_outer") + + testStream(joined)( + AddData(leftInput, (1, 5), (3, 5)), + CheckNewAnswer(), + // states + // left: (1, 5), (3, 5) + // right: nothing + assertNumStateRows(total = 2, updated = 2), + AddData(rightInput, (1, 10), (2, 5)), + // Match left row in the state. + CheckNewAnswer(Row(1, 1, 5, 10)), + // states + // left: (1, 5), (3, 5) + // right: (1, 10), (2, 5) + assertNumStateRows(total = 4, updated = 2), + AddData(rightInput, (1, 9)), + // Match left row in the state. + CheckNewAnswer(Row(1, 1, 5, 9)), + // states + // left: (1, 5), (3, 5) + // right: (1, 10), (2, 5), (1, 9) + assertNumStateRows(total = 5, updated = 1), + // Increase event time watermark to 20s by adding data with time = 30s on both inputs. 
+ AddData(leftInput, (1, 7), (1, 30)), + CheckNewAnswer(Row(1, 1, 7, 9), Row(1, 1, 7, 10)), + // states + // left: (1, 5), (3, 5), (1, 7), (1, 30) + // right: (1, 10), (2, 5), (1, 9) + assertNumStateRows(total = 7, updated = 2), + // Watermark = 30 - 10 = 20, no matched row. + // Generate outer join result for all non-matched rows when the watermark advances. + AddData(rightInput, (0, 30)), + CheckNewAnswer(Row(3, null, 5, null), Row(null, 2, null, 5)), + // states + // left: (1, 30) + // right: (0, 30) + // + // states evicted + // left: (1, 5), (3, 5), (1, 5) (below watermark = 20) + // right: (1, 10), (2, 5), (1, 9) (below watermark = 20) + assertNumStateRows(total = 2, updated = 1) + ) + } + + test("self full outer join") { + val (inputStream, query) = setupWindowedSelfJoin("full_outer") + + testStream(query)( + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + CheckNewAnswer(Row(2, 2L, 2, 2L), Row(4, 4L, 4, 4L)), + // batch 1 - global watermark = 0 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L) + // right: (2, 2L), (4, 4L) + assertNumStateRows(total = 7, updated = 7), + AddData(inputStream, (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L)), + CheckNewAnswer(Row(6, 6L, 6, 6L), Row(8, 8L, 8, 8L), Row(10, 10L, 10, 10L)), + // batch 2 - global watermark = 5 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L), (6, 6L), (7, 7L), (8, 8L), + // (9, 9L), (10, 10L) + // right: (6, 6L), (8, 8L), (10, 10L) + // + // states evicted + // left: nothing (it waits for 5 seconds more than watermark due to join condition) + // right: (2, 2L), (4, 4L) + assertNumStateRows(total = 13, updated = 8), + AddData(inputStream, (11, 11L), (12, 12L), (13, 13L), (14, 14L), (15, 15L)), + CheckNewAnswer(Row(12, 12L, 12, 12L), Row(14, 14L, 14, 14L), Row(1, 1L, null, null), + Row(3, 3L, null, null)), + // batch 3 - global watermark = 9 + // states + // left: (4, 4L), (5, 5L), (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L), (11, 11L), + // (12, 12L), (13, 13L), (14, 14L), (15, 15L) + // right: (10, 10L), (12, 12L), (14, 14L) + // + // states evicted + // left: (1, 1L), (2, 2L), (3, 3L) + // right: (6, 6L), (8, 8L) + assertNumStateRows(total = 15, updated = 7) + ) + } +} + class StreamingLeftSemiJoinSuite extends StreamingJoinSuite { import testImplicits._ From a4788ee8c61e1373e6eded41bb57d84c68149968 Mon Sep 17 00:00:00 2001 From: Cheng Su Date: Wed, 2 Dec 2020 15:28:16 +0900 Subject: [PATCH 073/150] [MINOR][SS] Rename auxiliary protected methods in StreamingJoinSuite ### What changes were proposed in this pull request? Per request from https://github.com/apache/spark/pull/30395#issuecomment-735028698, here we remove `Windowed` from methods names `setupWindowedJoinWithRangeCondition` and `setupWindowedSelfJoin` as they don't join on time window. ### Why are the changes needed? There's no such official name for `windowed join`, so this is to help avoid confusion for future developers. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing unit tests. Closes #30563 from c21/stream-minor. 
Authored-by: Cheng Su Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../spark/sql/streaming/StreamingJoinSuite.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index 476abcbf5c241..d264886c8cf46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -136,7 +136,7 @@ abstract class StreamingJoinSuite (leftInput, rightInput, select) } - protected def setupWindowedJoinWithRangeCondition(joinType: String) + protected def setupJoinWithRangeCondition(joinType: String) : (MemoryStream[(Int, Int)], MemoryStream[(Int, Int)], DataFrame) = { val leftInput = MemoryStream[(Int, Int)] @@ -167,7 +167,7 @@ abstract class StreamingJoinSuite (leftInput, rightInput, select) } - protected def setupWindowedSelfJoin(joinType: String) + protected def setupSelfJoin(joinType: String) : (MemoryStream[(Int, Long)], DataFrame) = { val inputStream = MemoryStream[(Int, Long)] @@ -750,7 +750,7 @@ class StreamingOuterJoinSuite extends StreamingJoinSuite { ("right_outer", Row(null, 2, null, 5)) ).foreach { case (joinType: String, outerResult) => test(s"${joinType.replaceAllLiterally("_", " ")} with watermark range condition") { - val (leftInput, rightInput, joined) = setupWindowedJoinWithRangeCondition(joinType) + val (leftInput, rightInput, joined) = setupJoinWithRangeCondition(joinType) testStream(joined)( AddData(leftInput, (1, 5), (3, 5)), @@ -830,7 +830,7 @@ class StreamingOuterJoinSuite extends StreamingJoinSuite { } test("SPARK-26187 self left outer join should not return outer nulls for already matched rows") { - val (inputStream, query) = setupWindowedSelfJoin("left_outer") + val (inputStream, query) = setupSelfJoin("left_outer") testStream(query)( AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), @@ -1190,7 +1190,7 @@ class StreamingFullOuterJoinSuite extends StreamingJoinSuite { } test("full outer join with watermark range condition") { - val (leftInput, rightInput, joined) = setupWindowedJoinWithRangeCondition("full_outer") + val (leftInput, rightInput, joined) = setupJoinWithRangeCondition("full_outer") testStream(joined)( AddData(leftInput, (1, 5), (3, 5)), @@ -1236,7 +1236,7 @@ class StreamingFullOuterJoinSuite extends StreamingJoinSuite { } test("self full outer join") { - val (inputStream, query) = setupWindowedSelfJoin("full_outer") + val (inputStream, query) = setupSelfJoin("full_outer") testStream(query)( AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), @@ -1394,7 +1394,7 @@ class StreamingLeftSemiJoinSuite extends StreamingJoinSuite { } test("left semi join with watermark range condition") { - val (leftInput, rightInput, joined) = setupWindowedJoinWithRangeCondition("left_semi") + val (leftInput, rightInput, joined) = setupJoinWithRangeCondition("left_semi") testStream(joined)( AddData(leftInput, (1, 5), (3, 5)), @@ -1439,7 +1439,7 @@ class StreamingLeftSemiJoinSuite extends StreamingJoinSuite { } test("self left semi join") { - val (inputStream, query) = setupWindowedSelfJoin("left_semi") + val (inputStream, query) = setupSelfJoin("left_semi") testStream(query)( AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), From 290aa021796139e503454d315e5cd350f836ab42 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 2 Dec 2020 
18:23:48 +0900 Subject: [PATCH 074/150] [SPARK-33618][CORE] Use hadoop-client instead of hadoop-client-api to make hadoop-aws work ### What changes were proposed in this pull request? This reverts commit SPARK-33212 (cb3fa6c9368e64184a5f7b19688181d11de9511c) mostly with three exceptions: 1. `SparkSubmitUtils` was updated recently by SPARK-33580 2. `resource-managers/yarn/pom.xml` was updated recently by SPARK-33104 to add `hadoop-yarn-server-resourcemanager` test dependency. 3. Adjust `com.fasterxml.jackson.module:jackson-module-jaxb-annotations` dependency in K8s module which is updated recently by SPARK-33471. ### Why are the changes needed? According to [HADOOP-16080](https://issues.apache.org/jira/browse/HADOOP-16080) since Apache Hadoop 3.1.1, `hadoop-aws` doesn't work with `hadoop-client-api`. It fails at write operation like the following. **1. Spark distribution with `-Phadoop-cloud`** ```scala $ bin/spark-shell --conf spark.hadoop.fs.s3a.access.key=$AWS_ACCESS_KEY_ID --conf spark.hadoop.fs.s3a.secret.key=$AWS_SECRET_ACCESS_KEY 20/11/30 23:01:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). Spark context available as 'sc' (master = local[*], app id = local-1606806088715). Spark session available as 'spark'. Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.1.0-SNAPSHOT /_/ Using Scala version 2.12.10 (OpenJDK 64-Bit Server VM, Java 1.8.0_272) Type in expressions to have them evaluated. Type :help for more information. scala> spark.read.parquet("s3a://dongjoon/users.parquet").show 20/11/30 23:01:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties +------+--------------+----------------+ | name|favorite_color|favorite_numbers| +------+--------------+----------------+ |Alyssa| null| [3, 9, 15, 20]| | Ben| red| []| +------+--------------+----------------+ scala> Seq(1).toDF.write.parquet("s3a://dongjoon/out.parquet") 20/11/30 23:02:14 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)/ 1] java.lang.NoSuchMethodError: org.apache.hadoop.util.SemaphoredDelegatingExecutor.(Lcom/google/common/util/concurrent/ListeningExecutorService;IZ)V ``` **2. Spark distribution without `-Phadoop-cloud`** ```scala $ bin/spark-shell --conf spark.hadoop.fs.s3a.access.key=$AWS_ACCESS_KEY_ID --conf spark.hadoop.fs.s3a.secret.key=$AWS_SECRET_ACCESS_KEY -c spark.eventLog.enabled=true -c spark.eventLog.dir=s3a://dongjoon/spark-events/ --packages org.apache.hadoop:hadoop-aws:3.2.0,org.apache.hadoop:hadoop-common:3.2.0 ... java.lang.NoSuchMethodError: org.apache.hadoop.util.SemaphoredDelegatingExecutor.(Lcom/google/common/util/concurrent/ListeningExecutorService;IZ)V at org.apache.hadoop.fs.s3a.S3AFileSystem.create(S3AFileSystem.java:772) ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CI. Closes #30508 from dongjoon-hyun/SPARK-33212-REVERT. 
Authored-by: Dongjoon Hyun Signed-off-by: HyukjinKwon --- common/network-yarn/pom.xml | 8 +-- core/pom.xml | 16 +---- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 3 +- dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 52 +++++++++++++- external/kafka-0-10-assembly/pom.xml | 8 +-- external/kafka-0-10-sql/pom.xml | 4 -- external/kafka-0-10-token-provider/pom.xml | 5 -- external/kinesis-asl-assembly/pom.xml | 8 +-- hadoop-cloud/pom.xml | 7 +- launcher/pom.xml | 9 +-- pom.xml | 57 +++------------- resource-managers/kubernetes/core/pom.xml | 9 +++ resource-managers/yarn/pom.xml | 67 +++++++------------ .../spark/deploy/yarn/ApplicationMaster.scala | 6 +- .../deploy/yarn/BaseYarnClusterSuite.scala | 10 --- sql/catalyst/pom.xml | 4 -- sql/hive/pom.xml | 5 -- .../hive/client/IsolatedClientLoader.scala | 19 +----- 18 files changed, 107 insertions(+), 190 deletions(-) diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 9938e5d769e12..0225db81925c5 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -65,13 +65,7 @@ org.apache.hadoop - ${hadoop-client-api.artifact} - ${hadoop.version} - - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.version} + hadoop-client org.slf4j diff --git a/core/pom.xml b/core/pom.xml index 9d2bf7dbe57a9..ce6f6ed9c7051 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -66,13 +66,7 @@ org.apache.hadoop - ${hadoop-client-api.artifact} - ${hadoop.version} - - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.version} + hadoop-client org.apache.spark @@ -183,14 +177,6 @@ org.apache.commons commons-text - - commons-io - commons-io - - - commons-collections - commons-collections - com.google.code.findbugs jsr305 diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index 8802220726f78..a19558bc2a5e3 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -127,7 +127,7 @@ javax.inject/1//javax.inject-1.jar javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar javolution/5.5.1//javolution-5.5.1.jar -jaxb-api/2.2.11//jaxb-api-2.2.11.jar +jaxb-api/2.2.2//jaxb-api-2.2.2.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar @@ -226,6 +226,7 @@ spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar +stax-api/1.0-2//stax-api-1.0-2.jar stax-api/1.0.1//stax-api-1.0.1.jar stream/2.9.6//stream-2.9.6.jar super-csv/2.2.0//super-csv-2.2.0.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index d45eeea0ee92b..24283224dd37d 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -3,12 +3,14 @@ JLargeArrays/1.5//JLargeArrays-1.5.jar JTransforms/3.1//JTransforms-3.1.jar RoaringBitmap/0.9.0//RoaringBitmap-0.9.0.jar ST4/4.0.4//ST4-4.0.4.jar +accessors-smart/1.2//accessors-smart-1.2.jar activation/1.1.1//activation-1.1.1.jar aircompressor/0.10//aircompressor-0.10.jar algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar antlr4-runtime/4.8-1//antlr4-runtime-4.8-1.jar aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar +aopalliance/1.0//aopalliance-1.0.jar arpack_combined_all/0.1//arpack_combined_all-0.1.jar 
arrow-format/2.0.0//arrow-format-2.0.0.jar arrow-memory-core/2.0.0//arrow-memory-core-2.0.0.jar @@ -25,12 +27,15 @@ breeze_2.12/1.0//breeze_2.12-1.0.jar cats-kernel_2.12/2.0.0-M4//cats-kernel_2.12-2.0.0-M4.jar chill-java/0.9.5//chill-java-0.9.5.jar chill_2.12/0.9.5//chill_2.12-0.9.5.jar +commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar commons-cli/1.2//commons-cli-1.2.jar commons-codec/1.10//commons-codec-1.10.jar commons-collections/3.2.2//commons-collections-3.2.2.jar commons-compiler/3.0.16//commons-compiler-3.0.16.jar commons-compress/1.20//commons-compress-1.20.jar +commons-configuration2/2.1.1//commons-configuration2-2.1.1.jar commons-crypto/1.1.0//commons-crypto-1.1.0.jar +commons-daemon/1.0.13//commons-daemon-1.0.13.jar commons-dbcp/1.4//commons-dbcp-1.4.jar commons-httpclient/3.1//commons-httpclient-3.1.jar commons-io/2.5//commons-io-2.5.jar @@ -50,13 +55,30 @@ datanucleus-api-jdo/4.2.4//datanucleus-api-jdo-4.2.4.jar datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar derby/10.12.1.1//derby-10.12.1.1.jar +dnsjava/2.1.7//dnsjava-2.1.7.jar dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +ehcache/3.3.1//ehcache-3.3.1.jar flatbuffers-java/1.9.0//flatbuffers-java-1.9.0.jar generex/1.0.2//generex-1.0.2.jar +geronimo-jcache_1.0_spec/1.0-alpha-1//geronimo-jcache_1.0_spec-1.0-alpha-1.jar gson/2.2.4//gson-2.2.4.jar guava/14.0.1//guava-14.0.1.jar -hadoop-client-api/3.2.0//hadoop-client-api-3.2.0.jar -hadoop-client-runtime/3.2.0//hadoop-client-runtime-3.2.0.jar +guice-servlet/4.0//guice-servlet-4.0.jar +guice/4.0//guice-4.0.jar +hadoop-annotations/3.2.0//hadoop-annotations-3.2.0.jar +hadoop-auth/3.2.0//hadoop-auth-3.2.0.jar +hadoop-client/3.2.0//hadoop-client-3.2.0.jar +hadoop-common/3.2.0//hadoop-common-3.2.0.jar +hadoop-hdfs-client/3.2.0//hadoop-hdfs-client-3.2.0.jar +hadoop-mapreduce-client-common/3.2.0//hadoop-mapreduce-client-common-3.2.0.jar +hadoop-mapreduce-client-core/3.2.0//hadoop-mapreduce-client-core-3.2.0.jar +hadoop-mapreduce-client-jobclient/3.2.0//hadoop-mapreduce-client-jobclient-3.2.0.jar +hadoop-yarn-api/3.2.0//hadoop-yarn-api-3.2.0.jar +hadoop-yarn-client/3.2.0//hadoop-yarn-client-3.2.0.jar +hadoop-yarn-common/3.2.0//hadoop-yarn-common-3.2.0.jar +hadoop-yarn-registry/3.2.0//hadoop-yarn-registry-3.2.0.jar +hadoop-yarn-server-common/3.2.0//hadoop-yarn-server-common-3.2.0.jar +hadoop-yarn-server-web-proxy/3.2.0//hadoop-yarn-server-web-proxy-3.2.0.jar hive-beeline/2.3.7//hive-beeline-2.3.7.jar hive-cli/2.3.7//hive-cli-2.3.7.jar hive-common/2.3.7//hive-common-2.3.7.jar @@ -86,6 +108,8 @@ jackson-core/2.10.0//jackson-core-2.10.0.jar jackson-databind/2.10.0//jackson-databind-2.10.0.jar jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar +jackson-jaxrs-base/2.9.5//jackson-jaxrs-base-2.9.5.jar +jackson-jaxrs-json-provider/2.9.5//jackson-jaxrs-json-provider-2.9.5.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar jackson-module-paranamer/2.10.0//jackson-module-paranamer-2.10.0.jar @@ -98,11 +122,13 @@ jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar janino/3.0.16//janino-3.0.16.jar javassist/3.25.0-GA//javassist-3.25.0-GA.jar +javax.inject/1//javax.inject-1.jar javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar 
javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.11//jaxb-api-2.2.11.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar +jcip-annotations/1.0-1//jcip-annotations-1.0-1.jar jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar @@ -116,14 +142,30 @@ jline/2.14.6//jline-2.14.6.jar joda-time/2.10.5//joda-time-2.10.5.jar jodd-core/3.5.2//jodd-core-3.5.2.jar jpam/1.1//jpam-1.1.jar +json-smart/2.3//json-smart-2.3.jar json/1.8//json-1.8.jar json4s-ast_2.12/3.7.0-M5//json4s-ast_2.12-3.7.0-M5.jar json4s-core_2.12/3.7.0-M5//json4s-core_2.12-3.7.0-M5.jar json4s-jackson_2.12/3.7.0-M5//json4s-jackson_2.12-3.7.0-M5.jar json4s-scalap_2.12/3.7.0-M5//json4s-scalap_2.12-3.7.0-M5.jar +jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar +kerb-admin/1.0.1//kerb-admin-1.0.1.jar +kerb-client/1.0.1//kerb-client-1.0.1.jar +kerb-common/1.0.1//kerb-common-1.0.1.jar +kerb-core/1.0.1//kerb-core-1.0.1.jar +kerb-crypto/1.0.1//kerb-crypto-1.0.1.jar +kerb-identity/1.0.1//kerb-identity-1.0.1.jar +kerb-server/1.0.1//kerb-server-1.0.1.jar +kerb-simplekdc/1.0.1//kerb-simplekdc-1.0.1.jar +kerb-util/1.0.1//kerb-util-1.0.1.jar +kerby-asn1/1.0.1//kerby-asn1-1.0.1.jar +kerby-config/1.0.1//kerby-config-1.0.1.jar +kerby-pkix/1.0.1//kerby-pkix-1.0.1.jar +kerby-util/1.0.1//kerby-util-1.0.1.jar +kerby-xdr/1.0.1//kerby-xdr-1.0.1.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar kubernetes-client/4.12.0//kubernetes-client-4.12.0.jar kubernetes-model-admissionregistration/4.12.0//kubernetes-model-admissionregistration-4.12.0.jar @@ -161,7 +203,9 @@ metrics-json/4.1.1//metrics-json-4.1.1.jar metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar minlog/1.3.0//minlog-1.3.0.jar netty-all/4.1.51.Final//netty-all-4.1.51.Final.jar +nimbus-jose-jwt/4.41.1//nimbus-jose-jwt-4.41.1.jar objenesis/2.6//objenesis-2.6.jar +okhttp/2.7.5//okhttp-2.7.5.jar okhttp/3.12.12//okhttp-3.12.12.jar okio/1.14.0//okio-1.14.0.jar opencsv/2.3//opencsv-2.3.jar @@ -180,6 +224,7 @@ parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar py4j/0.10.9//py4j-0.10.9.jar pyrolite/4.30//pyrolite-4.30.jar +re2j/1.1//re2j-1.1.jar scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar scala-compiler/2.12.10//scala-compiler-2.12.10.jar scala-library/2.12.10//scala-library-2.12.10.jar @@ -197,12 +242,15 @@ spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar stax-api/1.0.1//stax-api-1.0.1.jar +stax2-api/3.1.4//stax2-api-3.1.4.jar stream/2.9.6//stream-2.9.6.jar super-csv/2.2.0//super-csv-2.2.0.jar threeten-extra/1.5.0//threeten-extra-1.5.0.jar +token-provider/1.0.1//token-provider-1.0.1.jar transaction-api/1.1//transaction-api-1.1.jar univocity-parsers/2.9.0//univocity-parsers-2.9.0.jar velocity/1.5//velocity-1.5.jar +woodstox-core/5.0.3//woodstox-core-5.0.3.jar xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar xz/1.5//xz-1.5.jar zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index b1e306c499385..d9d9fb7f55c77 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -71,15 +71,9 @@ org.apache.hadoop - ${hadoop-client-api.artifact} - ${hadoop.version} + hadoop-client provided - - org.apache.hadoop - 
${hadoop-client-runtime.artifact} - ${hadoop.version} - org.apache.avro avro-mapred diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 06a6bef005e69..95a99ac88412e 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -79,10 +79,6 @@ kafka-clients ${kafka.version} - - com.google.code.findbugs - jsr305 - org.apache.commons commons-pool2 diff --git a/external/kafka-0-10-token-provider/pom.xml b/external/kafka-0-10-token-provider/pom.xml index 1b0d6d322917f..941946f30e96f 100644 --- a/external/kafka-0-10-token-provider/pom.xml +++ b/external/kafka-0-10-token-provider/pom.xml @@ -58,11 +58,6 @@ mockito-core test - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.deps.scope} - org.apache.spark spark-tags_${scala.binary.version} diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 5a49358a84241..76ee5bb7b2f85 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -91,15 +91,9 @@ org.apache.hadoop - ${hadoop-client-api.artifact} - ${hadoop.version} + hadoop-client provided - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.version} - org.apache.avro avro-ipc diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index a5642a5a68fe4..8689e0b8a9ea8 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -58,15 +58,10 @@ org.apache.hadoop - ${hadoop-client-api.artifact} + hadoop-client ${hadoop.version} provided - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.version} - org.apache.hadoop - ${hadoop-client-api.artifact} - ${hadoop.version} - test - - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.version} + hadoop-client test diff --git a/pom.xml b/pom.xml index f0ad9b0167c32..4d6e3bbc95378 100644 --- a/pom.xml +++ b/pom.xml @@ -244,15 +244,6 @@ compile test - - hadoop-client-api - hadoop-client-runtime - hadoop-client-minicluster - - - org.apache.hadoop - hadoop-client-api - ${hadoop.version} - ${hadoop.deps.scope} - - - org.apache.hadoop - hadoop-client-runtime - ${hadoop.version} - ${hadoop.deps.scope} - - - org.apache.hadoop - hadoop-client-minicluster - ${yarn.version} - test - - org.apache.hadoop hadoop-client @@ -1688,14 +1654,6 @@ org.apache.ant ant
      - - org.apache.hadoop - hadoop-common - - - org.apache.hadoop - hadoop-auth - org.apache.zookeeper zookeeper @@ -2460,6 +2418,17 @@ + + enforce-no-duplicate-dependencies + + enforce + + + + + + + @@ -2919,7 +2888,6 @@ maven-shade-plugin false - false org.spark-project.spark:unused @@ -3181,9 +3149,6 @@ 2.7.4 2.7.1 2.4 - hadoop-client - hadoop-client - hadoop-client diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index edeb95fdba684..18e1c65e2e932 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -63,6 +63,10 @@ com.fasterxml.jackson.core * + + com.fasterxml.jackson.module + jackson-module-jaxb-annotations + com.fasterxml.jackson.dataformat jackson-dataformat-yaml @@ -81,6 +85,11 @@ jackson-dataformat-yaml ${fasterxml.jackson.version} + + com.fasterxml.jackson.module + jackson-module-jaxb-annotations + ${fasterxml.jackson.version} + diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index f6d6ddccc99c3..e9122ce202723 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -40,42 +40,6 @@ true - - hadoop-2.7 - - - org.apache.hadoop - hadoop-yarn-api - - - org.apache.hadoop - hadoop-yarn-common - - - org.apache.hadoop - hadoop-yarn-server-web-proxy - - - org.apache.hadoop - hadoop-yarn-client - - - org.apache.hadoop - hadoop-yarn-server-tests - tests - test - - - - org.apache.hadoop - hadoop-yarn-server-resourcemanager - test - - - @@ -105,20 +69,23 @@ org.apache.hadoop - ${hadoop-client-api.artifact} - ${hadoop.version} + hadoop-yarn-api org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.version} - ${hadoop.deps.scope} + hadoop-yarn-common org.apache.hadoop - ${hadoop-client-minicluster.artifact} - ${hadoop.version} - test + hadoop-yarn-server-web-proxy + + + org.apache.hadoop + hadoop-yarn-client + + + org.apache.hadoop + hadoop-client @@ -175,6 +142,18 @@ test + + org.apache.hadoop + hadoop-yarn-server-tests + tests + test + + + org.apache.hadoop + hadoop-yarn-server-resourcemanager + test + + org.mockito mockito-core diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index e23773229c560..be9a88ca9b1d6 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.yarn import java.io.{File, IOException} import java.lang.reflect.{InvocationTargetException, Modifier} -import java.net.{URI, URL, URLEncoder} +import java.net.{URI, URL} import java.security.PrivilegedExceptionAction import java.util.concurrent.{TimeoutException, TimeUnit} @@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.api._ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException +import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils import org.apache.hadoop.yarn.util.{ConverterUtils, Records} import org.apache.spark._ @@ -307,8 +308,7 @@ private[spark] class ApplicationMaster( // The client-mode AM doesn't listen for incoming connections, so report an invalid port. 
registerAM(Utils.localHostName, -1, sparkConf, sparkConf.getOption("spark.driver.appUIAddress"), appAttemptId) - val encodedAppId = URLEncoder.encode(appAttemptId.getApplicationId.toString, "UTF-8") - addAmIpFilter(Some(driverRef), s"/proxy/$encodedAppId") + addAmIpFilter(Some(driverRef), ProxyUriUtils.getPath(appAttemptId.getApplicationId)) createAllocator(driverRef, sparkConf, clientRpcEnv, appAttemptId, cachedResourcesConf) reporterThread.join() } catch { diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index a813b9913f23b..20f5339c46fef 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -80,16 +80,6 @@ abstract class BaseYarnClusterSuite yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage", "100.0") - // capacity-scheduler.xml is missing in hadoop-client-minicluster so this is a workaround - yarnConf.set("yarn.scheduler.capacity.root.queues", "default") - yarnConf.setInt("yarn.scheduler.capacity.root.default.capacity", 100) - yarnConf.setFloat("yarn.scheduler.capacity.root.default.user-limit-factor", 1) - yarnConf.setInt("yarn.scheduler.capacity.root.default.maximum-capacity", 100) - yarnConf.set("yarn.scheduler.capacity.root.default.state", "RUNNING") - yarnConf.set("yarn.scheduler.capacity.root.default.acl_submit_applications", "*") - yarnConf.set("yarn.scheduler.capacity.root.default.acl_administer_queue", "*") - yarnConf.setInt("yarn.scheduler.capacity.node-locality-delay", -1) - yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1) yarnCluster.init(yarnConf) yarnCluster.start() diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index af976fa1fa983..6b79eb722fcdd 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -104,10 +104,6 @@ org.antlr antlr4-runtime - - javax.xml.bind - jaxb-api - commons-codec commons-codec diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 4fca6264c0594..0453094cf8b7b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -162,11 +162,6 @@ org.datanucleus datanucleus-core - - org.apache.hadoop - ${hadoop-client-runtime.artifact} - ${hadoop.deps.scope} - org.apache.thrift libthrift diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 9663e03ee6a74..c0758dcdfc879 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -112,24 +112,11 @@ private[hive] object IsolatedClientLoader extends Logging { hadoopVersion: String, ivyPath: Option[String], remoteRepos: String): Seq[URL] = { - val hadoopJarNames = if (hadoopVersion.startsWith("3")) { - Seq(s"org.apache.hadoop:hadoop-client-api:$hadoopVersion", - s"org.apache.hadoop:hadoop-client-runtime:$hadoopVersion") - } else { - Seq(s"org.apache.hadoop:hadoop-client:$hadoopVersion") - } val hiveArtifacts = version.extraDeps ++ Seq("hive-metastore", "hive-exec", "hive-common", "hive-serde") .map(a => s"org.apache.hive:$a:${version.fullVersion}") ++ - Seq("com.google.guava:guava:14.0.1") ++ hadoopJarNames - - val extraExclusions = if (hadoopVersion.startsWith("3")) { - // this 
introduced from lower version of Hive could conflict with jars in Hadoop 3.2+, so - // exclude here in favor of the ones in Hadoop 3.2+ - Seq("org.apache.hadoop:hadoop-auth") - } else { - Seq.empty - } + Seq("com.google.guava:guava:14.0.1", + s"org.apache.hadoop:hadoop-client:$hadoopVersion") val classpath = quietly { SparkSubmitUtils.resolveMavenCoordinates( @@ -137,7 +124,7 @@ private[hive] object IsolatedClientLoader extends Logging { SparkSubmitUtils.buildIvySettings( Some(remoteRepos), ivyPath), - exclusions = version.exclusions ++ extraExclusions) + exclusions = version.exclusions) } val allFiles = classpath.split(",").map(new File(_)).toSet

From 084d38b64ecbcaa9fac47ffca5604cf2a72936fc Mon Sep 17 00:00:00 2001
From: yangjie01
Date: Wed, 2 Dec 2020 18:41:49 +0900
Subject: [PATCH 075/150] [SPARK-33557][CORE][MESOS][TEST] Ensure the relationship between STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT and NETWORK_TIMEOUT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What changes were proposed in this pull request?

As described in SPARK-33557, `HeartbeatReceiver` and `MesosCoarseGrainedSchedulerBackend` will always use `Network.NETWORK_TIMEOUT.defaultValueString` as the value of `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT` when `NETWORK_TIMEOUT` is configured but `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT` is not. This is different from the relationship described in `configuration.md`.

To fix this problem, the main changes of this PR are as follows:

- Remove the explicit default value of `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT`
- Use the actual value of `NETWORK_TIMEOUT` as `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT` when `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT` is not configured, in `HeartbeatReceiver` and `MesosCoarseGrainedSchedulerBackend`

### Why are the changes needed?

To ensure the relationship between `NETWORK_TIMEOUT` and `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT` is as described in `configuration.md`.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

- Pass the Jenkins or GitHub Action
- Manually tested configuring `NETWORK_TIMEOUT` and `STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT` locally

Closes #30547 from LuciferYang/SPARK-33557.
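To make the restored fallback concrete, a minimal sketch with hypothetical values (the timeout value below is illustrative, not from this patch):

```scala
import org.apache.spark.SparkConf

// Only the network timeout is configured; the block-manager heartbeat timeout
// is deliberately left unset.
val conf = new SparkConf().set("spark.network.timeout", "200s")
// After this change, HeartbeatReceiver (and the Mesos backend) should resolve
// the heartbeat timeout from the actual spark.network.timeout value (200s here)
// instead of always falling back to NETWORK_TIMEOUT's static 120s default.
```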
Authored-by: yangjie01 Signed-off-by: HyukjinKwon --- core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala | 4 +++- .../scala/org/apache/spark/internal/config/package.scala | 2 +- .../org/apache/spark/repl/ExecutorClassLoaderSuite.scala | 1 + .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 5 ++++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index 233ad884a721a..13ff075660cd7 100644 --- a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -80,7 +80,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) // executor ID -> timestamp of when the last heartbeat from this executor was received private val executorLastSeen = new HashMap[String, Long] - private val executorTimeoutMs = sc.conf.get(config.STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT) + private val executorTimeoutMs = sc.conf.get( + config.STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT + ).getOrElse(Utils.timeStringAsMs(s"${sc.conf.get(Network.NETWORK_TIMEOUT)}s")) private val checkTimeoutIntervalMs = sc.conf.get(Network.NETWORK_TIMEOUT_INTERVAL) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 6639f20a068d4..f6de5e4128ca5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -504,7 +504,7 @@ package object config { .version("0.7.0") .withAlternative("spark.storage.blockManagerSlaveTimeoutMs") .timeConf(TimeUnit.MILLISECONDS) - .createWithDefaultString(Network.NETWORK_TIMEOUT.defaultValueString) + .createOptional private[spark] val STORAGE_CLEANUP_FILES_AFTER_EXECUTOR_EXIT = ConfigBuilder("spark.storage.cleanupFilesAfterExecutorExit") diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index 386de19e919e6..23ea3fee2505b 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -231,6 +231,7 @@ class ExecutorClassLoaderSuite .setMaster("local") .setAppName("executor-class-loader-test") .set("spark.network.timeout", "11s") + .set("spark.network.timeoutInterval", "11s") .set("spark.repl.class.outputDir", tempDir1.getAbsolutePath) val sc = new SparkContext(conf) try { diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index efcef09132f5b..6fedce61d8208 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -34,6 +34,7 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkContext, SparkExceptio import org.apache.spark.deploy.mesos.config._ import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.config +import org.apache.spark.internal.config.Network import org.apache.spark.internal.config.Tests.IS_TESTING import 
org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.network.netty.SparkTransportConf @@ -651,7 +652,9 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( .registerDriverWithShuffleService( agent.hostname, externalShufflePort, - sc.conf.get(config.STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT), + sc.conf.get( + config.STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT + ).getOrElse(Utils.timeStringAsMs(s"${sc.conf.get(Network.NETWORK_TIMEOUT)}s")), sc.conf.get(config.EXECUTOR_HEARTBEAT_INTERVAL)) agent.shuffleRegistered = true } From 28dad1ba770e5b7f7cf542da1ae3f05975a969c6 Mon Sep 17 00:00:00 2001 From: neko Date: Wed, 2 Dec 2020 09:24:19 -0600 Subject: [PATCH 076/150] [SPARK-33504][CORE] The application log in the Spark history server contains sensitive attributes should be redacted ### What changes were proposed in this pull request? To make sure the sensitive attributes to be redacted in the history server log. ### Why are the changes needed? We found the secure attributes like password in SparkListenerJobStart and SparkListenerStageSubmitted events would not been redated, resulting in sensitive attributes can be viewd directly. The screenshot can be viewed in the attachment of JIRA spark-33504 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? muntual test works well, I have also added unit testcase. Closes #30446 from akiyamaneko/eventlog_unredact. Authored-by: neko Signed-off-by: Thomas Graves --- .../scheduler/EventLoggingListener.scala | 24 ++++++- .../scheduler/EventLoggingListenerSuite.scala | 64 ++++++++++++++++++- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 1fda03f732636..d4e22d739098f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -18,7 +18,9 @@ package org.apache.spark.scheduler import java.net.URI +import java.util.Properties +import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.hadoop.conf.Configuration @@ -103,7 +105,7 @@ private[spark] class EventLoggingListener( // Events that do not trigger a flush override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = { - logEvent(event) + logEvent(event.copy(properties = redactProperties(event.properties))) if (shouldLogStageExecutorMetrics) { // record the peak metrics for the new stage liveStageExecutorMetrics.put((event.stageInfo.stageId, event.stageInfo.attemptNumber()), @@ -156,7 +158,9 @@ private[spark] class EventLoggingListener( logEvent(event, flushLogger = true) } - override def onJobStart(event: SparkListenerJobStart): Unit = logEvent(event, flushLogger = true) + override def onJobStart(event: SparkListenerJobStart): Unit = { + logEvent(event.copy(properties = redactProperties(event.properties)), flushLogger = true) + } override def onJobEnd(event: SparkListenerJobEnd): Unit = logEvent(event, flushLogger = true) @@ -276,6 +280,22 @@ private[spark] class EventLoggingListener( logWriter.stop() } + private def redactProperties(properties: Properties): Properties = { + if (properties == null) { + return properties + } + val redactedProperties = new Properties + // properties may contain some custom local properties such as stage/job description + // only properties in sparkConf need to be redacted. 
+ val (globalProperties, localProperties) = properties.asScala.toSeq.partition { + case (key, _) => sparkConf.contains(key) + } + (Utils.redact(sparkConf, globalProperties) ++ localProperties).foreach { + case (key, value) => redactedProperties.setProperty(key, value) + } + redactedProperties + } + private[spark] def redactEvent( event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = { // environmentDetails maps a string descriptor to a set of properties diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index c4a8bcbb26a1d..7acb8451e3b38 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.scheduler import java.io.{File, InputStream} -import java.util.Arrays +import java.util.{Arrays, Properties} import scala.collection.immutable.Map import scala.collection.mutable @@ -98,6 +98,68 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit assert(redactedProps(key) == "*********(redacted)") } + test("Spark-33504 sensitive attributes redaction in properties") { + val (secretKey, secretPassword) = ("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", + "secret_password") + val (customKey, customValue) = ("parse_token", "secret_password") + + val conf = getLoggingConf(testDirPath, None).set(secretKey, secretPassword) + + val properties = new Properties() + properties.setProperty(secretKey, secretPassword) + properties.setProperty(customKey, customValue) + + val logName = "properties-reaction-test" + val eventLogger = new EventLoggingListener(logName, None, testDirPath.toUri(), conf) + val listenerBus = new LiveListenerBus(conf) + + val stageId = 1 + val jobId = 1 + val stageInfo = new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + + val events = Array(SparkListenerStageSubmitted(stageInfo, properties), + SparkListenerJobStart(jobId, 0, Seq(stageInfo), properties)) + + eventLogger.start() + listenerBus.start(Mockito.mock(classOf[SparkContext]), Mockito.mock(classOf[MetricsSystem])) + listenerBus.addToEventLogQueue(eventLogger) + events.foreach(event => listenerBus.post(event)) + listenerBus.stop() + eventLogger.stop() + + val logData = EventLogFileReader.openEventLog(new Path(eventLogger.logWriter.logPath), + fileSystem) + try { + val lines = readLines(logData) + val logStart = SparkListenerLogStart(SPARK_VERSION) + assert(lines.size === 3) + assert(lines(0).contains("SparkListenerLogStart")) + assert(lines(1).contains("SparkListenerStageSubmitted")) + assert(lines(2).contains("SparkListenerJobStart")) + + lines.foreach{ + line => JsonProtocol.sparkEventFromJson(parse(line)) match { + case logStartEvent: SparkListenerLogStart => + assert(logStartEvent == logStart) + + case stageSubmittedEvent: SparkListenerStageSubmitted => + assert(stageSubmittedEvent.properties.getProperty(secretKey) == "*********(redacted)") + assert(stageSubmittedEvent.properties.getProperty(customKey) == customValue) + + case jobStartEvent : SparkListenerJobStart => + assert(jobStartEvent.properties.getProperty(secretKey) == "*********(redacted)") + assert(jobStartEvent.properties.getProperty(customKey) == customValue) + + case _ => assert(false) + } + } + } finally { + logData.close() + } + } + 
test("Executor metrics update") { testStageExecutorMetricsEventLogging() } From df8d3f1bf779ce1a9f3520939ab85814f09b48b7 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Wed, 2 Dec 2020 16:03:08 +0000 Subject: [PATCH 077/150] [SPARK-33544][SQL][FOLLOW-UP] Rename NoSideEffect to NoThrow and clarify the documentation more ### What changes were proposed in this pull request? This PR is a followup of https://github.com/apache/spark/pull/30504. It proposes: - Rename `NoSideEffect` to `NoThrow`, and use `Expression.deterministic` together where it is used. - Clarify, in the docs in the expressions, that it means they don't throw exceptions ### Why are the changes needed? `NoSideEffect` virtually means that `Expression.eval` does not throw an exception, and the expressions are deterministic. It's best to be explicit so `NoThrow` was proposed - I looked if there's a similar name to represent this concept and borrowed the name of [nothrow](https://clang.llvm.org/docs/AttributeReference.html#nothrow). For determinism, we already have a way to note it under `Expression.deterministic`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manually ran the existing unittests written. Closes #30570 from HyukjinKwon/SPARK-33544. Authored-by: HyukjinKwon Signed-off-by: Wenchen Fan --- .../expressions/complexTypeCreator.scala | 18 ++++++++++++------ .../sql/catalyst/optimizer/expressions.scala | 2 +- .../optimizer/ConstantFoldingSuite.scala | 2 +- .../InferFiltersFromGenerateSuite.scala | 6 +++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index f0f92e2d935f1..cb59fbda2b3b9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -31,10 +31,16 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String /** - * Trait to indicate the expression doesn't have any side effects. This can be used - * to indicate its ok to optimize it out under certain circumstances. + * Trait to indicate the expression does not throw an exception by itself when they are evaluated. + * For example, UDFs, [[AssertTrue]], etc can throw an exception when they are executed. + * In such case, it is necessary to call [[Expression.eval]], and the optimization rule should + * not ignore it. + * + * This trait can be used in an optimization rule such as + * [[org.apache.spark.sql.catalyst.optimizer.ConstantFolding]] to fold the expressions that + * do not need to execute, for example, `size(array(c0, c1, c2))`. */ -trait NoSideEffect +trait NoThrow /** * Returns an Array containing the evaluation of all children expressions. 
@@ -48,7 +54,7 @@ trait NoSideEffect """, since = "1.1.0") case class CreateArray(children: Seq[Expression], useStringTypeWhenEmpty: Boolean) - extends Expression with NoSideEffect { + extends Expression with NoThrow { def this(children: Seq[Expression]) = { this(children, SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) @@ -166,7 +172,7 @@ private [sql] object GenArrayData { """, since = "2.0.0") case class CreateMap(children: Seq[Expression], useStringTypeWhenEmpty: Boolean) - extends Expression with NoSideEffect{ + extends Expression with NoThrow { def this(children: Seq[Expression]) = { this(children, SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) @@ -385,7 +391,7 @@ object CreateStruct { """, since = "1.5.0") // scalastyle:on line.size.limit -case class CreateNamedStruct(children: Seq[Expression]) extends Expression with NoSideEffect { +case class CreateNamedStruct(children: Seq[Expression]) extends Expression with NoThrow { lazy val (nameExprs, valExprs) = children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 4725f49340451..1b1e2ad71e7c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -45,7 +45,7 @@ object ConstantFolding extends Rule[LogicalPlan] { private def hasNoSideEffect(e: Expression): Boolean = e match { case _: Attribute => true case _: Literal => true - case _: NoSideEffect => e.children.forall(hasNoSideEffect) + case _: NoThrow if e.deterministic => e.children.forall(hasNoSideEffect) case _ => false } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala index fd9b58a7a06aa..ae644c1110740 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConstantFoldingSuite.scala @@ -264,7 +264,7 @@ class ConstantFoldingSuite extends PlanTest { comparePlans(optimized, correctAnswer) } - test("SPARK-33544: Constant folding test with sideaffects") { + test("SPARK-33544: Constant folding test with side effects") { val originalQuery = testRelation .select('a) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala index c6fa1bd6e415c..93a1d414ed403 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromGenerateSuite.scala @@ -90,13 +90,13 @@ class InferFiltersFromGenerateSuite extends PlanTest { Seq(Explode(_), PosExplode(_)).foreach { f => val createArrayExplode = f(CreateArray(Seq('c1))) - test("Don't infer filters from CreateArray " + createArrayExplode) { + test("SPARK-33544: Don't infer filters from CreateArray " + createArrayExplode) { val originalQuery = testRelation.generate(createArrayExplode).analyze val optimized = OptimizeInferAndConstantFold.execute(originalQuery) 
comparePlans(optimized, originalQuery) } val createMapExplode = f(CreateMap(Seq('c1, 'c2))) - test("Don't infer filters from CreateMap " + createMapExplode) { + test("SPARK-33544: Don't infer filters from CreateMap " + createMapExplode) { val originalQuery = testRelation.generate(createMapExplode).analyze val optimized = OptimizeInferAndConstantFold.execute(originalQuery) comparePlans(optimized, originalQuery) @@ -105,7 +105,7 @@ class InferFiltersFromGenerateSuite extends PlanTest { Seq(Inline(_)).foreach { f => val createArrayStructExplode = f(CreateArray(Seq(CreateStruct(Seq('c1))))) - test("Don't infer filters from CreateArray " + createArrayStructExplode) { + test("SPARK-33544: Don't infer filters from CreateArray " + createArrayStructExplode) { val originalQuery = testRelation.generate(createArrayStructExplode).analyze val optimized = OptimizeInferAndConstantFold.execute(originalQuery) comparePlans(optimized, originalQuery) From 58583f7c3fdcac1232607a7ab4b0d052320ac3ea Mon Sep 17 00:00:00 2001 From: "xuewei.linxuewei" Date: Wed, 2 Dec 2020 16:10:45 +0000 Subject: [PATCH 078/150] [SPARK-33619][SQL] Fix GetMapValueUtil code generation error ### What changes were proposed in this pull request? Code Gen bug fix that introduced by SPARK-33460 ``` GetMapValueUtil s"""throw new NoSuchElementException("Key " + $eval2 + " does not exist.");""" SHOULD BE s"""throw new java.util.NoSuchElementException("Key " + $eval2 + " does not exist.");""" ``` And the reason why SPARK-33460 failed to detect this bug via UT, it was because that `checkExceptionInExpression ` did not work as expect like `checkEvaluation` which will try eval expression with BOTH `CODEGEN_ONLY` and `NO_CODEGEN` mode, and in this PR, will also fix this Test bug, too. ### Why are the changes needed? Bug Fix. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? Add UT and Existing UT. Closes #30560 from leanken/leanken-SPARK-33619. 
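A minimal standalone sketch of the two-mode check this adds to the test helper; the mode objects and the intercept helper below are placeholders, not the real Catalyst test utilities:

```scala
// Sketch: run the same evaluation under both the interpreted and the
// codegen-only path; a bug that only appears in generated Java code (such as
// an unqualified NoSuchElementException) is caught only by the second mode.
object CheckBothModesSketch {
  sealed trait FactoryMode
  case object NoCodegen extends FactoryMode
  case object CodegenOnly extends FactoryMode

  def checkException(eval: FactoryMode => Unit, expectedMsg: String): Unit = {
    val err = intercept(Seq(NoCodegen, CodegenOnly).foreach(eval))
    assert(err.exists(_.getMessage.contains(expectedMsg)),
      s"expected an error containing: $expectedMsg")
  }

  private def intercept(body: => Unit): Option[Throwable] =
    try { body; None } catch { case t: Throwable => Some(t) }

  def main(args: Array[String]): Unit = {
    // A fake expression that fails only when its "generated code" runs.
    def eval(mode: FactoryMode): Unit = mode match {
      case CodegenOnly => throw new RuntimeException(
        "Cannot determine simple type name \"NoSuchElementException\"")
      case NoCodegen => ()
    }
    checkException(eval, "Cannot determine simple type name")
  }
}
```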
Authored-by: xuewei.linxuewei Signed-off-by: Wenchen Fan --- .../expressions/complexTypeExtractors.scala | 2 +- .../expressions/datetimeExpressions.scala | 7 ++- .../expressions/intervalExpressions.scala | 14 +++--- .../expressions/ExpressionEvalHelper.scala | 49 ++++++------------- .../ExpressionEvalHelperSuite.scala | 25 +++++++++- .../IntervalExpressionsSuite.scala | 36 +++++++------- .../expressions/MathExpressionsSuite.scala | 5 +- 7 files changed, 70 insertions(+), 68 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index 767650d022200..ef247efbe1a04 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -394,7 +394,7 @@ trait GetMapValueUtil extends BinaryExpression with ImplicitCastInputTypes { val keyJavaType = CodeGenerator.javaType(keyType) nullSafeCodeGen(ctx, ev, (eval1, eval2) => { val keyNotFoundBranch = if (failOnError) { - s"""throw new NoSuchElementException("Key " + $eval2 + " does not exist.");""" + s"""throw new java.util.NoSuchElementException("Key " + $eval2 + " does not exist.");""" } else { s"${ev.isNull} = true;" } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index bbf1e4657f351..424887a13cb97 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1789,8 +1789,11 @@ private case class GetTimestamp( """, group = "datetime_funcs", since = "3.0.0") -case class MakeDate(year: Expression, month: Expression, day: Expression, - failOnError: Boolean = SQLConf.get.ansiEnabled) +case class MakeDate( + year: Expression, + month: Expression, + day: Expression, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant { def this(year: Expression, month: Expression, day: Expression) = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala index 6219457bba994..27067e17e7f45 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/intervalExpressions.scala @@ -166,13 +166,13 @@ case class MakeInterval( extends SeptenaryExpression with ImplicitCastInputTypes with NullIntolerant { def this( - years: Expression, - months: Expression, - weeks: Expression, - days: Expression, - hours: Expression, - mins: Expression, - sec: Expression) = { + years: Expression, + months: Expression, + weeks: Expression, + days: Expression, + hours: Expression, + mins: Expression, + sec: Expression) = { this(years, months, weeks, days, hours, mins, sec, SQLConf.get.ansiEnabled) } def this( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 
842c8f3243f2a..70eb391ad6e05 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -36,7 +36,6 @@ import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, MapData} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils /** * A few helper functions for expression evaluation testing. Mixin this trait to use them. @@ -160,9 +159,14 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB expectedErrMsg: String): Unit = { def checkException(eval: => Unit, testMode: String): Unit = { + val modes = Seq(CodegenObjectFactoryMode.CODEGEN_ONLY, CodegenObjectFactoryMode.NO_CODEGEN) withClue(s"($testMode)") { val errMsg = intercept[T] { - eval + for (fallbackMode <- modes) { + withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> fallbackMode.toString) { + eval + } + } }.getMessage if (errMsg == null) { if (expectedErrMsg != null) { @@ -192,22 +196,6 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB expression.eval(inputRow) } - protected def generateProject( - generator: => Projection, - expression: Expression): Projection = { - try { - generator - } catch { - case e: Throwable => - fail( - s""" - |Code generation of $expression failed: - |$e - |${Utils.exceptionString(e)} - """.stripMargin) - } - } - protected def checkEvaluationWithoutCodegen( expression: Expression, expected: Any, @@ -244,9 +232,7 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB protected def evaluateWithMutableProjection( expression: => Expression, inputRow: InternalRow = EmptyRow): Any = { - val plan = generateProject( - MutableProjection.create(Alias(expression, s"Optimized($expression)")() :: Nil), - expression) + val plan = MutableProjection.create(Alias(expression, s"Optimized($expression)")() :: Nil) plan.initialize(0) plan(inputRow).get(0, expression.dataType) @@ -292,11 +278,9 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB // SPARK-16489 Explicitly doing code generation twice so code gen will fail if // some expression is reusing variable names across different instances. // This behavior is tested in ExpressionEvalHelperSuite. 
- val plan = generateProject( - UnsafeProjection.create( - Alias(expression, s"Optimized($expression)1")() :: - Alias(expression, s"Optimized($expression)2")() :: Nil), - expression) + val plan = UnsafeProjection.create( + Alias(expression, s"Optimized($expression)1")() :: + Alias(expression, s"Optimized($expression)2")() :: Nil) plan.initialize(0) plan(inputRow) @@ -319,16 +303,13 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB checkEvaluationWithMutableProjection(expression, expected) checkEvaluationWithOptimization(expression, expected) - var plan = generateProject( - GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), - expression) + var plan: Projection = + GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil) plan.initialize(0) var actual = plan(inputRow).get(0, expression.dataType) assert(checkResult(actual, expected, expression)) - plan = generateProject( - GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), - expression) + plan = GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil) plan.initialize(0) val ref = new BoundReference(0, expression.dataType, nullable = true) actual = GenerateSafeProjection.generate(ref :: Nil)(plan(inputRow)).get(0, expression.dataType) @@ -456,9 +437,7 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB } } - val plan = generateProject( - GenerateMutableProjection.generate(Alias(expr, s"Optimized($expr)")() :: Nil), - expr) + val plan = GenerateMutableProjection.generate(Alias(expr, s"Optimized($expr)")() :: Nil) val (codegen, codegenExc) = try { (Some(plan(inputRow).get(0, expr.dataType)), None) } catch { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelperSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelperSuite.scala index 54ef9641bee0d..3cc50da38906e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelperSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelperSuite.scala @@ -32,8 +32,8 @@ import org.apache.spark.sql.types.{DataType, IntegerType, MapType} */ class ExpressionEvalHelperSuite extends SparkFunSuite with ExpressionEvalHelper { - test("SPARK-16489 checkEvaluation should fail if expression reuses variable names") { - val e = intercept[RuntimeException] { checkEvaluation(BadCodegenExpression(), 10) } + test("SPARK-16489: checkEvaluation should fail if expression reuses variable names") { + val e = intercept[Exception] { checkEvaluation(BadCodegenExpression(), 10) } assert(e.getMessage.contains("some_variable")) } @@ -43,6 +43,12 @@ class ExpressionEvalHelperSuite extends SparkFunSuite with ExpressionEvalHelper } assert(e.getMessage.contains("and exprNullable was")) } + + test("SPARK-33619: make sure checkExceptionInExpression work as expected") { + checkExceptionInExpression[Exception]( + BadCodegenAndEvalExpression(), + "Cannot determine simple type name \"NoSuchElementException\"") + } } /** @@ -76,3 +82,18 @@ case class MapIncorrectDataTypeExpression() extends LeafExpression with CodegenF // since values includes null, valueContainsNull must be true override def dataType: DataType = MapType(IntegerType, IntegerType, valueContainsNull = false) } + +case class BadCodegenAndEvalExpression() extends LeafExpression { + override 
def nullable: Boolean = false + override def eval(input: InternalRow): Any = + throw new Exception("Cannot determine simple type name \"NoSuchElementException\"") + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + // it should be java.util.NoSuchElementException in generated code. + ev.copy(code = + code""" + |int ${ev.value} = 10; + |throw new NoSuchElementException("compile failed!"); + """.stripMargin) + } + override def dataType: DataType = IntegerType +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala index 5c73a91de4f79..950637c958426 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala @@ -217,15 +217,15 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("ANSI mode: make interval") { def check( - years: Int = 0, - months: Int = 0, - weeks: Int = 0, - days: Int = 0, - hours: Int = 0, - minutes: Int = 0, - seconds: Int = 0, - millis: Int = 0, - micros: Int = 0): Unit = { + years: Int = 0, + months: Int = 0, + weeks: Int = 0, + days: Int = 0, + hours: Int = 0, + minutes: Int = 0, + seconds: Int = 0, + millis: Int = 0, + micros: Int = 0): Unit = { val secFrac = DateTimeTestUtils.secFrac(seconds, millis, micros) val intervalExpr = MakeInterval(Literal(years), Literal(months), Literal(weeks), Literal(days), Literal(hours), Literal(minutes), @@ -238,15 +238,15 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } def checkException( - years: Int = 0, - months: Int = 0, - weeks: Int = 0, - days: Int = 0, - hours: Int = 0, - minutes: Int = 0, - seconds: Int = 0, - millis: Int = 0, - micros: Int = 0): Unit = { + years: Int = 0, + months: Int = 0, + weeks: Int = 0, + days: Int = 0, + hours: Int = 0, + minutes: Int = 0, + seconds: Int = 0, + millis: Int = 0, + micros: Int = 0): Unit = { val secFrac = DateTimeTestUtils.secFrac(seconds, millis, micros) val intervalExpr = MakeInterval(Literal(years), Literal(months), Literal(weeks), Literal(days), Literal(hours), Literal(minutes), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathExpressionsSuite.scala index b4096f21bea3a..6d09e28362e11 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathExpressionsSuite.scala @@ -138,9 +138,8 @@ class MathExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { expression: Expression, inputRow: InternalRow = EmptyRow): Unit = { - val plan = generateProject( - GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), - expression) + val plan = + GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil) val actual = plan(inputRow).get(0, expression.dataType) if (!actual.asInstanceOf[Double].isNaN) { From 91182d6cce0a56a50801d530aff0c8e3aba59e27 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Wed, 2 Dec 2020 08:43:30 -0800 Subject: [PATCH 079/150] [SPARK-33626][K8S][TEST] Allow k8s integration tests to assert both driver and executor 
logs for expected log(s) ### What changes were proposed in this pull request? Allow k8s integration tests to assert both driver and executor logs for expected log(s) ### Why are the changes needed? Some of the tests will be able to provide full coverage of the use case, by asserting both driver and executor logs. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? TBD Closes #30568 from ScrapCodes/expectedDriverLogChanges. Authored-by: Prashant Sharma Signed-off-by: Dongjoon Hyun --- .../integrationtest/DecommissionSuite.scala | 6 ++-- .../k8s/integrationtest/DepsTestsSuite.scala | 2 +- .../k8s/integrationtest/KubernetesSuite.scala | 32 ++++++++++++++++--- .../integrationtest/PythonTestsSuite.scala | 6 ++-- .../k8s/integrationtest/RTestsSuite.scala | 2 +- .../SparkConfPropagateSuite.scala | 22 ++++++------- 6 files changed, 47 insertions(+), 23 deletions(-) diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala index 9d7db04bb72b0..92f6a32cd156a 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DecommissionSuite.scala @@ -38,7 +38,7 @@ private[spark] trait DecommissionSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_DECOMISSIONING, mainClass = "", - expectedLogOnCompletion = Seq( + expectedDriverLogOnCompletion = Seq( "Finished waiting, stopping Spark", "Decommission executors", "Final accumulator value is: 100"), @@ -69,7 +69,7 @@ private[spark] trait DecommissionSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_DECOMISSIONING_CLEANUP, mainClass = "", - expectedLogOnCompletion = Seq( + expectedDriverLogOnCompletion = Seq( "Finished waiting, stopping Spark", "Decommission executors"), appArgs = Array.empty[String], @@ -104,7 +104,7 @@ private[spark] trait DecommissionSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_SCALE, mainClass = "", - expectedLogOnCompletion = Seq( + expectedDriverLogOnCompletion = Seq( "Finished waiting, stopping Spark", "Decommission executors"), appArgs = Array.empty[String], diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala index 8f6e9cd8af740..760e9ba55d335 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala @@ -177,7 +177,7 @@ private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = pySparkFiles, mainClass = "", - expectedLogOnCompletion = Seq( + expectedDriverLogOnCompletion = Seq( "Python runtime version check is: True", "Python environment version check is: True", "Python runtime version check for executor is: True"), diff --git 
a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index cc226b341916d..193a02aad0cea 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -171,6 +171,7 @@ class KubernetesSuite extends SparkFunSuite appResource, SPARK_PI_MAIN_CLASS, Seq("Pi is roughly 3"), + Seq(), appArgs, driverPodChecker, executorPodChecker, @@ -192,6 +193,7 @@ class KubernetesSuite extends SparkFunSuite SPARK_DFS_READ_WRITE_TEST, Seq(s"Success! Local Word Count $wordCount and " + s"DFS Word Count $wordCount agree."), + Seq(), appArgs, driverPodChecker, executorPodChecker, @@ -212,6 +214,7 @@ class KubernetesSuite extends SparkFunSuite appResource, SPARK_REMOTE_MAIN_CLASS, Seq(s"Mounting of ${appArgs.head} was true"), + Seq(), appArgs, driverPodChecker, executorPodChecker, @@ -261,7 +264,8 @@ class KubernetesSuite extends SparkFunSuite protected def runSparkApplicationAndVerifyCompletion( appResource: String, mainClass: String, - expectedLogOnCompletion: Seq[String], + expectedDriverLogOnCompletion: Seq[String], + expectedExecutorLogOnCompletion: Seq[String] = Seq(), appArgs: Array[String], driverPodChecker: Pod => Unit, executorPodChecker: Pod => Unit, @@ -374,7 +378,6 @@ class KubernetesSuite extends SparkFunSuite .list() .getItems .get(0) - driverPodChecker(driverPod) // If we're testing decommissioning we an executors, but we should have an executor @@ -383,14 +386,35 @@ class KubernetesSuite extends SparkFunSuite execPods.values.nonEmpty should be (true) } execPods.values.foreach(executorPodChecker(_)) + + val execPod: Option[Pod] = if (expectedExecutorLogOnCompletion.nonEmpty) { + Some(kubernetesTestComponents.kubernetesClient + .pods() + .withLabel("spark-app-locator", appLocator) + .withLabel("spark-role", "executor") + .list() + .getItems + .get(0)) + } else { + None + } + Eventually.eventually(patienceTimeout, patienceInterval) { - expectedLogOnCompletion.foreach { e => + expectedDriverLogOnCompletion.foreach { e => assert(kubernetesTestComponents.kubernetesClient .pods() .withName(driverPod.getMetadata.getName) .getLog .contains(e), - s"The application did not complete, did not find str ${e}") + s"The application did not complete, driver log did not contain str ${e}") + } + expectedExecutorLogOnCompletion.foreach { e => + assert(kubernetesTestComponents.kubernetesClient + .pods() + .withName(execPod.get.getMetadata.getName) + .getLog + .contains(e), + s"The application did not complete, executor log did not contain str ${e}") } } execWatcher.close() diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala index bad6f1c1021ba..457a766cae124 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/PythonTestsSuite.scala @@ -27,7 +27,7 @@ private[spark] trait PythonTestsSuite { 
k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_PI, mainClass = "", - expectedLogOnCompletion = Seq("Pi is roughly 3"), + expectedDriverLogOnCompletion = Seq("Pi is roughly 3"), appArgs = Array("5"), driverPodChecker = doBasicDriverPyPodCheck, executorPodChecker = doBasicExecutorPyPodCheck, @@ -41,7 +41,7 @@ private[spark] trait PythonTestsSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_FILES, mainClass = "", - expectedLogOnCompletion = Seq( + expectedDriverLogOnCompletion = Seq( "Python runtime version check is: True", "Python environment version check is: True", "Python runtime version check for executor is: True"), @@ -61,7 +61,7 @@ private[spark] trait PythonTestsSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = PYSPARK_MEMORY_CHECK, mainClass = "", - expectedLogOnCompletion = Seq( + expectedDriverLogOnCompletion = Seq( "PySpark Worker Memory Check is: True"), appArgs = Array(s"$additionalMemoryInBytes"), driverPodChecker = doDriverMemoryCheck, diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/RTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/RTestsSuite.scala index b7c8886a15ae7..a22066c18064c 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/RTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/RTestsSuite.scala @@ -26,7 +26,7 @@ private[spark] trait RTestsSuite { k8sSuite: KubernetesSuite => runSparkApplicationAndVerifyCompletion( appResource = SPARK_R_DATAFRAME_TEST, mainClass = "", - expectedLogOnCompletion = Seq("name: string (nullable = true)", "1 Justin"), + expectedDriverLogOnCompletion = Seq("name: string (nullable = true)", "1 Justin"), appArgs = Array.empty[String], driverPodChecker = doBasicDriverRPodCheck, executorPodChecker = doBasicExecutorRPodCheck, diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala index 6d15201d19796..5d3b426598fdd 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala @@ -16,14 +16,11 @@ */ package org.apache.spark.deploy.k8s.integrationtest -import java.io.{BufferedWriter, File, FileWriter} +import java.io.File import java.net.URL +import java.nio.file.Files -import scala.io.{BufferedSource, Source} - -import io.fabric8.kubernetes.api.model._ - -import org.apache.spark.internal.config +import scala.io.Source private[spark] trait SparkConfPropagateSuite { k8sSuite: KubernetesSuite => import KubernetesSuite.{k8sTestTag, SPARK_PI_MAIN_CLASS} @@ -38,18 +35,21 @@ private[spark] trait SparkConfPropagateSuite { k8sSuite: KubernetesSuite => val logConfFilePath = s"${sparkHomeDir.toFile}/conf/log4j.properties" try { - val writer = new BufferedWriter(new FileWriter(logConfFilePath)) - writer.write(content) - writer.close() + Files.write(new File(logConfFilePath).toPath, 
content.getBytes) sparkAppConf.set("spark.driver.extraJavaOptions", "-Dlog4j.debug") + sparkAppConf.set("spark.executor.extraJavaOptions", "-Dlog4j.debug") + + val log4jExpectedLog = + s"log4j: Reading configuration from URL file:/opt/spark/conf/log4j.properties" runSparkApplicationAndVerifyCompletion( appResource = containerLocalSparkDistroExamplesJar, mainClass = SPARK_PI_MAIN_CLASS, - expectedLogOnCompletion = (Seq("DEBUG", - s"log4j: Reading configuration from URL file:/opt/spark/conf/log4j.properties", + expectedDriverLogOnCompletion = (Seq("DEBUG", + log4jExpectedLog, "Pi is roughly 3")), + expectedExecutorLogOnCompletion = Seq(log4jExpectedLog), appArgs = Array.empty[String], driverPodChecker = doBasicDriverPodCheck, executorPodChecker = doBasicExecutorPodCheck, From a082f4600b1cb814442beed1b578bc3430a257a7 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 2 Dec 2020 17:51:22 +0000 Subject: [PATCH 080/150] [SPARK-33071][SPARK-33536][SQL] Avoid changing dataset_id of LogicalPlan in join() to not break DetectAmbiguousSelfJoin ### What changes were proposed in this pull request? Currently, `join()` uses `withPlan(logicalPlan)` for convenient to call some Dataset functions. But it leads to the `dataset_id` inconsistent between the `logicalPlan` and the original `Dataset`(because `withPlan(logicalPlan)` will create a new Dataset with the new id and reset the `dataset_id` with the new id of the `logicalPlan`). As a result, it breaks the rule `DetectAmbiguousSelfJoin`. In this PR, we propose to drop the usage of `withPlan` but use the `logicalPlan` directly so its `dataset_id` doesn't change. Besides, this PR also removes related metadata (`DATASET_ID_KEY`, `COL_POS_KEY`) when an `Alias` tries to construct its own metadata. Because the `Alias` is no longer a reference column after converting to an `Attribute`. To achieve that, we add a new field, `deniedMetadataKeys`, to indicate the metadata that needs to be removed. ### Why are the changes needed? For the query below, it returns the wrong result while it should throws ambiguous self join exception instead: ```scala val emp1 = Seq[TestData]( TestData(1, "sales"), TestData(2, "personnel"), TestData(3, "develop"), TestData(4, "IT")).toDS() val emp2 = Seq[TestData]( TestData(1, "sales"), TestData(2, "personnel"), TestData(3, "develop")).toDS() val emp3 = emp1.join(emp2, emp1("key") === emp2("key")).select(emp1("*")) emp1.join(emp3, emp1.col("key") === emp3.col("key"), "left_outer") .select(emp1.col("*"), emp3.col("key").as("e2")).show() // wrong result +---+---------+---+ |key| value| e2| +---+---------+---+ | 1| sales| 1| | 2|personnel| 2| | 3| develop| 3| | 4| IT| 4| +---+---------+---+ ``` This PR fixes the wrong behaviour. ### Does this PR introduce _any_ user-facing change? Yes, users hit the exception instead of the wrong result after this PR. ### How was this patch tested? Added a new unit test. Closes #30488 from Ngone51/fix-self-join. 
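A minimal standalone sketch of the `deniedMetadataKeys` idea, with plain maps standing in for Catalyst's `Metadata` and placeholder key names:

```scala
// Sketch: when an alias inherits metadata from its child, drop the entries
// that mark the child as a column reference of a particular Dataset, so the
// aliased column is no longer picked up as a self-join reference.
object DeniedMetadataKeysSketch {
  final case class AliasSketch(
      childMetadata: Map[String, String],
      deniedMetadataKeys: Seq[String] = Seq.empty) {
    def metadata: Map[String, String] = childMetadata -- deniedMetadataKeys
  }

  def main(args: Array[String]): Unit = {
    val childMeta = Map(
      "__dataset_id" -> "42",    // placeholder for Dataset.DATASET_ID_KEY
      "__col_position" -> "0",   // placeholder for Dataset.COL_POS_KEY
      "comment" -> "key column")

    val aliased = AliasSketch(childMeta,
      deniedMetadataKeys = Seq("__dataset_id", "__col_position"))

    // Only the reference-tracking entries are stripped; other metadata survives.
    assert(aliased.metadata == Map("comment" -> "key column"))
  }
}
```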
Authored-by: yi.wu Signed-off-by: Wenchen Fan --- .../catalyst/expressions/AliasHelper.scala | 3 +- .../expressions/namedExpressions.scala | 15 +++++-- .../scala/org/apache/spark/sql/Column.scala | 5 ++- .../scala/org/apache/spark/sql/Dataset.scala | 39 +++++++++++-------- .../spark/sql/DataFrameSelfJoinSuite.scala | 29 ++++++++++++++ .../sql/SparkSessionExtensionSuite.scala | 7 ++-- 6 files changed, 73 insertions(+), 25 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala index ec47875754a6f..c61eb68db5bfa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala @@ -89,7 +89,8 @@ trait AliasHelper { a.copy(child = trimAliases(a.child))( exprId = a.exprId, qualifier = a.qualifier, - explicitMetadata = Some(a.metadata)) + explicitMetadata = Some(a.metadata), + deniedMetadataKeys = a.deniedMetadataKeys) case a: MultiAlias => a.copy(child = trimAliases(a.child)) case other => trimAliases(other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 2abd9d7bb4423..22aabd3c6b30b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -143,11 +143,14 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn * fully qualified way. Consider the examples tableName.name, subQueryAlias.name. * tableName and subQueryAlias are possible qualifiers. * @param explicitMetadata Explicit metadata associated with this alias that overwrites child's. + * @param deniedMetadataKeys Keys of metadata entries that are supposed to be removed when + * inheriting the metadata from the child. */ case class Alias(child: Expression, name: String)( val exprId: ExprId = NamedExpression.newExprId, val qualifier: Seq[String] = Seq.empty, - val explicitMetadata: Option[Metadata] = None) + val explicitMetadata: Option[Metadata] = None, + val deniedMetadataKeys: Seq[String] = Seq.empty) extends UnaryExpression with NamedExpression { // Alias(Generator, xx) need to be transformed into Generate(generator, ...) 
@@ -167,7 +170,11 @@ case class Alias(child: Expression, name: String)( override def metadata: Metadata = { explicitMetadata.getOrElse { child match { - case named: NamedExpression => named.metadata + case named: NamedExpression => + val builder = new MetadataBuilder().withMetadata(named.metadata) + deniedMetadataKeys.foreach(builder.remove) + builder.build() + case _ => Metadata.empty } } @@ -194,7 +201,7 @@ case class Alias(child: Expression, name: String)( override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix$delaySuffix" override protected final def otherCopyArgs: Seq[AnyRef] = { - exprId :: qualifier :: explicitMetadata :: Nil + exprId :: qualifier :: explicitMetadata :: deniedMetadataKeys :: Nil } override def hashCode(): Int = { @@ -205,7 +212,7 @@ case class Alias(child: Expression, name: String)( override def equals(other: Any): Boolean = other match { case a: Alias => name == a.name && exprId == a.exprId && child == a.child && qualifier == a.qualifier && - explicitMetadata == a.explicitMetadata + explicitMetadata == a.explicitMetadata && deniedMetadataKeys == a.deniedMetadataKeys case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 95134d9111593..86ba81340272b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -1164,7 +1164,10 @@ class Column(val expr: Expression) extends Logging { * @since 2.0.0 */ def name(alias: String): Column = withExpr { - Alias(normalizedExpr(), alias)() + // SPARK-33536: The Alias is no longer a column reference after converting to an attribute. + // These denied metadata keys are used to strip the column reference related metadata for + // the Alias. So it won't be caught as a column reference in DetectAmbiguousSelfJoin. + Alias(expr, alias)(deniedMetadataKeys = Seq(Dataset.DATASET_ID_KEY, Dataset.COL_POS_KEY)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 2c38a65ac2106..0716043bcf660 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -231,7 +231,8 @@ class Dataset[T] private[sql]( case _ => queryExecution.analyzed } - if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) { + if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED) && + plan.getTagValue(Dataset.DATASET_ID_TAG).isEmpty) { plan.setTagValue(Dataset.DATASET_ID_TAG, id) } plan @@ -259,15 +260,16 @@ class Dataset[T] private[sql]( private[sql] def resolve(colName: String): NamedExpression = { val resolver = sparkSession.sessionState.analyzer.resolver queryExecution.analyzed.resolveQuoted(colName, resolver) - .getOrElse { - val fields = schema.fieldNames - val extraMsg = if (fields.exists(resolver(_, colName))) { - s"; did you mean to quote the `$colName` column?" 
- } else "" - val fieldsStr = fields.mkString(", ") - val errorMsg = s"""Cannot resolve column name "$colName" among (${fieldsStr})${extraMsg}""" - throw new AnalysisException(errorMsg) - } + .getOrElse(throw resolveException(colName, schema.fieldNames)) + } + + private def resolveException(colName: String, fields: Array[String]): AnalysisException = { + val extraMsg = if (fields.exists(sparkSession.sessionState.analyzer.resolver(_, colName))) { + s"; did you mean to quote the `$colName` column?" + } else "" + val fieldsStr = fields.mkString(", ") + val errorMsg = s"""Cannot resolve column name "$colName" among (${fieldsStr})${extraMsg}""" + new AnalysisException(errorMsg) } private[sql] def numericColumns: Seq[Expression] = { @@ -1083,8 +1085,8 @@ class Dataset[T] private[sql]( } // If left/right have no output set intersection, return the plan. - val lanalyzed = withPlan(this.logicalPlan).queryExecution.analyzed - val ranalyzed = withPlan(right.logicalPlan).queryExecution.analyzed + val lanalyzed = this.queryExecution.analyzed + val ranalyzed = right.queryExecution.analyzed if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) { return withPlan(plan) } @@ -1092,17 +1094,22 @@ class Dataset[T] private[sql]( // Otherwise, find the trivially true predicates and automatically resolves them to both sides. // By the time we get here, since we have already run analysis, all attributes should've been // resolved and become AttributeReference. + val resolver = sparkSession.sessionState.analyzer.resolver val cond = plan.condition.map { _.transform { case catalyst.expressions.EqualTo(a: AttributeReference, b: AttributeReference) if a.sameRef(b) => catalyst.expressions.EqualTo( - withPlan(plan.left).resolve(a.name), - withPlan(plan.right).resolve(b.name)) + plan.left.resolveQuoted(a.name, resolver) + .getOrElse(throw resolveException(a.name, plan.left.schema.fieldNames)), + plan.right.resolveQuoted(b.name, resolver) + .getOrElse(throw resolveException(b.name, plan.right.schema.fieldNames))) case catalyst.expressions.EqualNullSafe(a: AttributeReference, b: AttributeReference) if a.sameRef(b) => catalyst.expressions.EqualNullSafe( - withPlan(plan.left).resolve(a.name), - withPlan(plan.right).resolve(b.name)) + plan.left.resolveQuoted(a.name, resolver) + .getOrElse(throw resolveException(a.name, plan.left.schema.fieldNames)), + plan.right.resolveQuoted(b.name, resolver) + .getOrElse(throw resolveException(b.name, plan.right.schema.fieldNames))) }} withPlan { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala index 3b3b54f75da57..50846d9d12b97 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{count, sum} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.test.SQLTestData.TestData class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { import testImplicits._ @@ -219,4 +220,32 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { Seq((1, 2), (1, 2), (2, 4), (2, 4)).map(Row.fromTuple)) } } + + test("SPARK-33071/SPARK-33536: Avoid changing dataset_id of LogicalPlan in join() " + + "to not break DetectAmbiguousSelfJoin") { + val emp1 = Seq[TestData]( + 
TestData(1, "sales"), + TestData(2, "personnel"), + TestData(3, "develop"), + TestData(4, "IT")).toDS() + val emp2 = Seq[TestData]( + TestData(1, "sales"), + TestData(2, "personnel"), + TestData(3, "develop")).toDS() + val emp3 = emp1.join(emp2, emp1("key") === emp2("key")).select(emp1("*")) + assertAmbiguousSelfJoin(emp1.join(emp3, emp1.col("key") === emp3.col("key"), + "left_outer").select(emp1.col("*"), emp3.col("key").as("e2"))) + } + + test("df.show() should also not change dataset_id of LogicalPlan") { + val df = Seq[TestData]( + TestData(1, "sales"), + TestData(2, "personnel"), + TestData(3, "develop"), + TestData(4, "IT")).toDF() + val ds_id1 = df.logicalPlan.getTagValue(Dataset.DATASET_ID_TAG) + df.show(0) + val ds_id2 = df.logicalPlan.getTagValue(Dataset.DATASET_ID_TAG) + assert(ds_id1 === ds_id2) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index 12abd31b99e93..f02d2041dd7f3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -573,8 +573,9 @@ class ColumnarBoundReference(ordinal: Int, dataType: DataType, nullable: Boolean class ColumnarAlias(child: ColumnarExpression, name: String)( override val exprId: ExprId = NamedExpression.newExprId, override val qualifier: Seq[String] = Seq.empty, - override val explicitMetadata: Option[Metadata] = None) - extends Alias(child, name)(exprId, qualifier, explicitMetadata) + override val explicitMetadata: Option[Metadata] = None, + override val deniedMetadataKeys: Seq[String] = Seq.empty) + extends Alias(child, name)(exprId, qualifier, explicitMetadata, deniedMetadataKeys) with ColumnarExpression { override def columnarEval(batch: ColumnarBatch): Any = child.columnarEval(batch) @@ -711,7 +712,7 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] { def replaceWithColumnarExpression(exp: Expression): ColumnarExpression = exp match { case a: Alias => new ColumnarAlias(replaceWithColumnarExpression(a.child), - a.name)(a.exprId, a.qualifier, a.explicitMetadata) + a.name)(a.exprId, a.qualifier, a.explicitMetadata, a.deniedMetadataKeys) case att: AttributeReference => new ColumnarAttributeReference(att.name, att.dataType, att.nullable, att.metadata)(att.exprId, att.qualifier) From b76c6b759c8dd549290aa174b62b8d34ea34aa3f Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Wed, 2 Dec 2020 12:44:39 -0800 Subject: [PATCH 081/150] [SPARK-33627][SQL] Add new function UNIX_SECONDS, UNIX_MILLIS and UNIX_MICROS ### What changes were proposed in this pull request? As https://github.com/apache/spark/pull/28534 adds functions from [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions) for converting numbers to timestamp, this PR is to add functions UNIX_SECONDS, UNIX_MILLIS and UNIX_MICROS for converting timestamp to numbers. ### Why are the changes needed? 1. Symmetry of the conversion functions 2. Casting timestamp type to numeric types is disallowed in ANSI mode, we should provide functions for users to complete the conversion. ### Does this PR introduce _any_ user-facing change? 3 new functions UNIX_SECONDS, UNIX_MILLIS and UNIX_MICROS for converting timestamp to long type. ### How was this patch tested? Unit tests. Closes #30566 from gengliangwang/timestampLong. 
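A minimal standalone sketch of the conversion semantics on the internal microsecond representation; the constants and helper names below are illustrative only:

```scala
// Sketch: UNIX_SECONDS/UNIX_MILLIS truncate toward negative infinity, so a
// timestamp of -1 millisecond belongs to second -1, not second 0; UNIX_MICROS
// is the identity on the internal microsecond value.
object UnixConversionSketch {
  private val MicrosPerSecond = 1000000L
  private val MicrosPerMilli = 1000L

  def unixSeconds(micros: Long): Long = Math.floorDiv(micros, MicrosPerSecond)
  def unixMillis(micros: Long): Long = Math.floorDiv(micros, MicrosPerMilli)
  def unixMicros(micros: Long): Long = micros

  def main(args: Array[String]): Unit = {
    assert(unixSeconds(1000000L) == 1L)   // 1970-01-01 00:00:01 UTC
    assert(unixMillis(1000000L) == 1000L)
    assert(unixMicros(1000000L) == 1000000L)
    assert(unixSeconds(-1000L) == -1L)    // -1 ms falls in the -1st second
  }
}
```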
Authored-by: Gengliang Wang Signed-off-by: Dongjoon Hyun --- .../catalyst/analysis/FunctionRegistry.scala | 3 + .../expressions/datetimeExpressions.scala | 73 +++++++++++++++++++ .../expressions/DateExpressionsSuite.scala | 45 ++++++++++++ .../sql-functions/sql-expression-schema.md | 5 +- .../resources/sql-tests/inputs/datetime.sql | 4 + .../sql-tests/results/ansi/datetime.sql.out | 26 ++++++- .../sql-tests/results/datetime-legacy.sql.out | 26 ++++++- .../sql-tests/results/datetime.sql.out | 26 ++++++- 8 files changed, 204 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 6fb9bed9625d5..5c2816a0baa95 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -430,6 +430,9 @@ object FunctionRegistry { expression[SecondsToTimestamp]("timestamp_seconds"), expression[MillisToTimestamp]("timestamp_millis"), expression[MicrosToTimestamp]("timestamp_micros"), + expression[UnixSeconds]("unix_seconds"), + expression[UnixMillis]("unix_millis"), + expression[UnixMicros]("unix_micros"), // collection functions expression[CreateArray]("array"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 424887a13cb97..60dc32c1571fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -524,6 +524,79 @@ case class MicrosToTimestamp(child: Expression) override def prettyName: String = "timestamp_micros" } +abstract class TimestampToLongBase extends UnaryExpression + with ExpectsInputTypes with NullIntolerant { + + protected def scaleFactor: Long + + override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType) + + override def dataType: DataType = LongType + + override def nullSafeEval(input: Any): Any = { + Math.floorDiv(input.asInstanceOf[Number].longValue(), scaleFactor) + } + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + if (scaleFactor == 1) { + defineCodeGen(ctx, ev, c => c) + } else { + defineCodeGen(ctx, ev, c => s"java.lang.Math.floorDiv($c, ${scaleFactor}L)") + } + } +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(timestamp) - Returns the number of seconds since 1970-01-01 00:00:00 UTC. Truncates higher levels of precision.", + examples = """ + Examples: + > SELECT _FUNC_(TIMESTAMP('1970-01-01 00:00:01Z')); + 1 + """, + group = "datetime_funcs", + since = "3.1.0") +// scalastyle:on line.size.limit +case class UnixSeconds(child: Expression) extends TimestampToLongBase { + override def scaleFactor: Long = MICROS_PER_SECOND + + override def prettyName: String = "unix_seconds" +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(timestamp) - Returns the number of milliseconds since 1970-01-01 00:00:00 UTC. 
Truncates higher levels of precision.", + examples = """ + Examples: + > SELECT _FUNC_(TIMESTAMP('1970-01-01 00:00:01Z')); + 1000 + """, + group = "datetime_funcs", + since = "3.1.0") +// scalastyle:on line.size.limit +case class UnixMillis(child: Expression) extends TimestampToLongBase { + override def scaleFactor: Long = MICROS_PER_MILLIS + + override def prettyName: String = "unix_millis" +} + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(timestamp) - Returns the number of microseconds since 1970-01-01 00:00:00 UTC.", + examples = """ + Examples: + > SELECT _FUNC_(TIMESTAMP('1970-01-01 00:00:01Z')); + 1000000 + """, + group = "datetime_funcs", + since = "3.1.0") +// scalastyle:on line.size.limit +case class UnixMicros(child: Expression) extends TimestampToLongBase { + override def scaleFactor: Long = 1L + + override def prettyName: String = "unix_micros" +} + @ExpressionDescription( usage = "_FUNC_(date) - Returns the year component of the date/timestamp.", examples = """ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 587ca0cdbed6e..8a1a34276341d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -1245,6 +1245,51 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkResult(Int.MinValue.toLong - 100) } + test("UNIX_SECONDS") { + checkEvaluation(UnixSeconds(Literal(null, TimestampType)), null) + var timestamp = Literal(new Timestamp(0L)) + checkEvaluation(UnixSeconds(timestamp), 0L) + timestamp = Literal(new Timestamp(1000L)) + checkEvaluation(UnixSeconds(timestamp), 1L) + timestamp = Literal(new Timestamp(-1000L)) + checkEvaluation(UnixSeconds(timestamp), -1L) + // -1ms is considered to be in -1st second, as 0-999ms is in 0th second. 
+ timestamp = Literal(new Timestamp(-1L)) + checkEvaluation(UnixSeconds(timestamp), -1L) + timestamp = Literal(new Timestamp(-1000L)) + checkEvaluation(UnixSeconds(timestamp), -1L) + // Truncates higher levels of precision + timestamp = Literal(new Timestamp(1999L)) + checkEvaluation(UnixSeconds(timestamp), 1L) + } + + test("UNIX_MILLIS") { + checkEvaluation(UnixMillis(Literal(null, TimestampType)), null) + var timestamp = Literal(new Timestamp(0L)) + checkEvaluation(UnixMillis(timestamp), 0L) + timestamp = Literal(new Timestamp(1000L)) + checkEvaluation(UnixMillis(timestamp), 1000L) + timestamp = Literal(new Timestamp(-1000L)) + checkEvaluation(UnixMillis(timestamp), -1000L) + // Truncates higher levels of precision + val timestampWithNanos = new Timestamp(1000L) + timestampWithNanos.setNanos(999999) + checkEvaluation(UnixMillis(Literal(timestampWithNanos)), 1000L) + } + + test("UNIX_MICROS") { + checkEvaluation(UnixMicros(Literal(null, TimestampType)), null) + var timestamp = Literal(new Timestamp(0L)) + checkEvaluation(UnixMicros(timestamp), 0L) + timestamp = Literal(new Timestamp(1000L)) + checkEvaluation(UnixMicros(timestamp), 1000000L) + timestamp = Literal(new Timestamp(-1000L)) + checkEvaluation(UnixMicros(timestamp), -1000000L) + val timestampWithNanos = new Timestamp(1000L) + timestampWithNanos.setNanos(1000) // 1 microsecond + checkEvaluation(UnixMicros(Literal(timestampWithNanos)), 1000001L) + } + test("TIMESTAMP_SECONDS") { def testIntegralFunc(value: Number): Unit = { checkEvaluation( diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 0a54dff3a1cea..861062a1f7705 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ ## Summary - - Number of queries: 342 + - Number of queries: 345 - Number of expressions that missing example: 13 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window ## Schema of Built-in Functions @@ -289,6 +289,9 @@ | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct | | org.apache.spark.sql.catalyst.expressions.UnaryPositive | positive | SELECT positive(1) | struct<(+ 1):int> | | org.apache.spark.sql.catalyst.expressions.Unhex | unhex | SELECT decode(unhex('537061726B2053514C'), 'UTF-8') | struct | +| org.apache.spark.sql.catalyst.expressions.UnixMicros | unix_micros | SELECT unix_micros(TIMESTAMP('1970-01-01 00:00:01Z')) | struct | +| org.apache.spark.sql.catalyst.expressions.UnixMillis | unix_millis | SELECT unix_millis(TIMESTAMP('1970-01-01 00:00:01Z')) | struct | +| org.apache.spark.sql.catalyst.expressions.UnixSeconds | unix_seconds | SELECT unix_seconds(TIMESTAMP('1970-01-01 00:00:01Z')) | struct | | org.apache.spark.sql.catalyst.expressions.UnixTimestamp | unix_timestamp | SELECT unix_timestamp() | struct | | org.apache.spark.sql.catalyst.expressions.Upper | ucase | SELECT ucase('SparkSql') | struct | | org.apache.spark.sql.catalyst.expressions.Upper | upper | SELECT upper('SparkSql') | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index 534e222b7c13e..c2ccb3ee0db06 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -14,6 +14,10 @@ select 
TIMESTAMP_MILLIS(-92233720368547758); select TIMESTAMP_SECONDS(0.1234567); -- truncation is OK for float/double select TIMESTAMP_SECONDS(0.1234567d), TIMESTAMP_SECONDS(FLOAT(0.1234567)); +-- UNIX_SECONDS, UNIX_MILLISECONDS and UNIX_MICROSECONDS +select UNIX_SECONDS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_SECONDS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_SECONDS(null); +select UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MILLIS(null); +select UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MICROS(null); -- [SPARK-16836] current_date and current_timestamp literals select current_date = current_date(), current_timestamp = current_timestamp(); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out index 10669f14aa87b..9d99d3b870b3f 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 117 +-- Number of queries: 120 -- !query @@ -87,6 +87,30 @@ struct +-- !query output +1606833008 1606833008 NULL + + +-- !query +select UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MILLIS(null) +-- !query schema +struct +-- !query output +1606833008000 1606833008999 NULL + + +-- !query +select UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MICROS(null) +-- !query schema +struct +-- !query output +1606833008000000 1606833008999999 NULL + + -- !query select current_date = current_date(), current_timestamp = current_timestamp() -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out index 7c2c62a2db496..73e9823d96a73 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 117 +-- Number of queries: 120 -- !query @@ -87,6 +87,30 @@ struct +-- !query output +1606833008 1606833008 NULL + + +-- !query +select UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MILLIS(null) +-- !query schema +struct +-- !query output +1606833008000 1606833008999 NULL + + +-- !query +select UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MICROS(null) +-- !query schema +struct +-- !query output +1606833008000000 1606833008999999 NULL + + -- !query select current_date = current_date(), current_timestamp = current_timestamp() -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 810ab6ef0cbfc..2c39c1291aa70 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 117 +-- Number of queries: 120 -- !query @@ -87,6 +87,30 @@ struct +-- !query output +1606833008 1606833008 NULL + + +-- !query +select 
UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MILLIS(null) +-- !query schema +struct +-- !query output +1606833008000 1606833008999 NULL + + +-- !query +select UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MICROS(null) +-- !query schema +struct +-- !query output +1606833008000000 1606833008999999 NULL + + -- !query select current_date = current_date(), current_timestamp = current_timestamp() -- !query schema From 92bfbcb2e372e8fecfe65bc582c779d9df4036bb Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 2 Dec 2020 12:58:41 -0800 Subject: [PATCH 082/150] [SPARK-33631][DOCS][TEST] Clean up spark.core.connection.ack.wait.timeout from configuration.md ### What changes were proposed in this pull request? SPARK-9767 remove `ConnectionManager` and related files, the configuration `spark.core.connection.ack.wait.timeout` previously used by `ConnectionManager` is no longer used by other Spark code, but it still exists in the `configuration.md`. So this pr cleans up the useless configuration item spark.core.connection.ack.wait.timeout` from `configuration.md`. ### Why are the changes needed? Clean up useless configuration from `configuration.md`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass the Jenkins or GitHub Action Closes #30569 from LuciferYang/SPARK-33631. Authored-by: yangjie01 Signed-off-by: Dongjoon Hyun --- .../spark/storage/BlockManagerReplicationSuite.scala | 2 -- docs/configuration.md | 11 ----------- 2 files changed, 13 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala index 0b673c580d71f..1e9b48102616f 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala @@ -95,8 +95,6 @@ trait BlockManagerReplicationBehavior extends SparkFunSuite conf.set(MEMORY_STORAGE_FRACTION, 0.999) conf.set(STORAGE_UNROLL_MEMORY_THRESHOLD, 512L) - // to make a replication attempt to inactive store fail fast - conf.set("spark.core.connection.ack.wait.timeout", "1s") // to make cached peers refresh frequently conf.set(STORAGE_CACHED_PEERS_TTL, 10) diff --git a/docs/configuration.md b/docs/configuration.md index d4d8e47645921..21506e6901263 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1919,7 +1919,6 @@ Apart from these, the following properties are also available, and may be useful 120s Default timeout for all network interactions. This config will be used in place of - spark.core.connection.ack.wait.timeout, spark.storage.blockManagerHeartbeatTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout if they are not configured. @@ -1982,16 +1981,6 @@ Apart from these, the following properties are also available, and may be useful 1.4.0 - - spark.core.connection.ack.wait.timeout - spark.network.timeout - - How long for the connection to wait for ack to occur before timing - out and giving up. To avoid unwilling timeout caused by long pause like GC, - you can set larger value. 
- - 1.1.1 - spark.network.maxRemoteBlockSizeFetchToMem 200m From f94cb53a90558285541090d484a6ae9938fe02e8 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 3 Dec 2020 09:34:42 +0900 Subject: [PATCH 083/150] [MINOR][INFRA] Use the latest image for GitHub Action jobs ### What changes were proposed in this pull request? Currently, GitHub Action is using two docker images. ``` $ git grep dongjoon/apache-spark-github-action-image .github/workflows/build_and_test.yml: image: dongjoon/apache-spark-github-action-image:20201015 .github/workflows/build_and_test.yml: image: dongjoon/apache-spark-github-action-image:20201025 ``` This PR aims to make it consistent by using the latest one. ``` - image: dongjoon/apache-spark-github-action-image:20201015 + image: dongjoon/apache-spark-github-action-image:20201025 ``` ### Why are the changes needed? This is for better maintainability. The image size is almost the same. ``` $ docker images | grep 202010 dongjoon/apache-spark-github-action-image 20201025 37adfa3d226a 5 weeks ago 2.18GB dongjoon/apache-spark-github-action-image 20201015 ff6fee8dc36d 6 weeks ago 2.16GB ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the GitHub Action. Closes #30578 from dongjoon-hyun/SPARK-MINOR. Authored-by: Dongjoon Hyun Signed-off-by: HyukjinKwon --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b2b6a38916eeb..a3bb083387f3e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -153,7 +153,7 @@ jobs: name: "Build modules: ${{ matrix.modules }}" runs-on: ubuntu-20.04 container: - image: dongjoon/apache-spark-github-action-image:20201015 + image: dongjoon/apache-spark-github-action-image:20201025 strategy: fail-fast: false matrix: From 4f9667035886a67e6c9a4e8fad2efa390e87ca68 Mon Sep 17 00:00:00 2001 From: uncleGen Date: Wed, 2 Dec 2020 17:11:51 -0800 Subject: [PATCH 084/150] [SPARK-31953][SS] Add Spark Structured Streaming History Server Support ### What changes were proposed in this pull request? Add Spark Structured Streaming History Server Support. ### Why are the changes needed? Add a streaming query history server plugin. ![image](https://user-images.githubusercontent.com/7402327/84248291-d26cfe80-ab3b-11ea-86d2-98205fa2bcc4.png) ![image](https://user-images.githubusercontent.com/7402327/84248347-e44ea180-ab3b-11ea-81de-eefe207656f2.png) ![image](https://user-images.githubusercontent.com/7402327/84248396-f0d2fa00-ab3b-11ea-9b0d-e410115471b0.png) - Follow-ups - Query duration should not update in history UI. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Update UT. Closes #28781 from uncleGen/SPARK-31953. 
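For context, a hedged sketch of how the plugin is exercised end to end: run a streaming application with event logging turned on, then let the history server replay the log. The app name and paths below match the bundled `local-1596020211915` test event log and are otherwise illustrative, not part of this change:

```scala
// Sketch: produce an event log that the new StreamingQueryHistoryServerPlugin can replay.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("StructuredKafkaWordCount")
  .master("local[*]")
  .config("spark.eventLog.enabled", "true")
  .config("spark.eventLog.dir", "/tmp/spark-history")
  .getOrCreate()

// ... start and stop a streaming query as usual. Afterwards, launch the history
// server (sbin/start-history-server.sh) with
// spark.history.fs.logDirectory=/tmp/spark-history and open the rebuilt
// "Structured Streaming" tab.
```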
Lead-authored-by: uncleGen Co-authored-by: Genmao Yu Co-authored-by: Yuanjian Li Signed-off-by: Shixiong Zhu --- dev/.rat-excludes | 1 + ...apache.spark.status.AppHistoryServerPlugin | 1 + .../streaming/StreamingQueryListenerBus.scala | 26 ++- .../StreamingQueryHistoryServerPlugin.scala | 43 +++++ .../ui/StreamingQueryStatusStore.scala | 53 ++++++ .../spark/sql/internal/SharedState.scala | 8 +- .../sql/streaming/StreamingQueryManager.scala | 3 +- .../sql/streaming/ui/StreamingQueryPage.scala | 44 ++--- .../ui/StreamingQueryStatisticsPage.scala | 27 +-- .../ui/StreamingQueryStatusListener.scala | 166 +++++++++++------- .../sql/streaming/ui/StreamingQueryTab.scala | 3 +- .../spark/sql/streaming/ui/UIUtils.scala | 12 +- .../spark-events/local-1596020211915 | 160 +++++++++++++++++ .../apache/spark/deploy/history/Utils.scala | 40 +++++ .../ui/StreamingQueryHistorySuite.scala | 63 +++++++ .../ui/StreamingQueryPageSuite.scala | 42 +++-- .../StreamingQueryStatusListenerSuite.scala | 159 ++++++++++++++--- 17 files changed, 693 insertions(+), 158 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryHistoryServerPlugin.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryStatusStore.scala create mode 100644 sql/core/src/test/resources/spark-events/local-1596020211915 create mode 100644 sql/core/src/test/scala/org/apache/spark/deploy/history/Utils.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryHistorySuite.scala diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 7da330dfe1fbf..167cf224f92c2 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -123,6 +123,7 @@ SessionHandler.java GangliaReporter.java application_1578436911597_0052 config.properties +local-1596020211915 app-20200706201101-0003 py.typed _metadata diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin b/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin index 0bba2f88b92a5..6771eef525307 100644 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin @@ -1 +1,2 @@ org.apache.spark.sql.execution.ui.SQLHistoryServerPlugin +org.apache.spark.sql.execution.ui.StreamingQueryHistoryServerPlugin diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala index 1b8d69ffb7521..4b98acd16f6fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala @@ -31,16 +31,21 @@ import org.apache.spark.util.ListenerBus * Spark listener bus, so that it can receive [[StreamingQueryListener.Event]]s and dispatch them * to StreamingQueryListeners. * - * Note that each bus and its registered listeners are associated with a single SparkSession + * Note 1: Each bus and its registered listeners are associated with a single SparkSession * and StreamingQueryManager. So this bus will dispatch events to registered listeners for only * those queries that were started in the associated SparkSession. 
+ * + * Note 2: To rebuild Structured Streaming UI in SHS, this bus will be registered into + * [[org.apache.spark.scheduler.ReplayListenerBus]]. We check `sparkListenerBus` defined or not to + * determine how to process [[StreamingQueryListener.Event]]. If false, it means this bus is used to + * replay all streaming query event from eventLog. */ -class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) +class StreamingQueryListenerBus(sparkListenerBus: Option[LiveListenerBus]) extends SparkListener with ListenerBus[StreamingQueryListener, StreamingQueryListener.Event] { import StreamingQueryListener._ - sparkListenerBus.addToQueue(this, StreamingQueryListenerBus.STREAM_EVENT_QUERY) + sparkListenerBus.foreach(_.addToQueue(this, StreamingQueryListenerBus.STREAM_EVENT_QUERY)) /** * RunIds of active queries whose events are supposed to be forwarded by this ListenerBus @@ -67,11 +72,11 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) event match { case s: QueryStartedEvent => activeQueryRunIds.synchronized { activeQueryRunIds += s.runId } - sparkListenerBus.post(s) + sparkListenerBus.foreach(bus => bus.post(s)) // post to local listeners to trigger callbacks postToAll(s) case _ => - sparkListenerBus.post(event) + sparkListenerBus.foreach(bus => bus.post(event)) } } @@ -95,7 +100,11 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) // synchronously and the ones attached to LiveListenerBus asynchronously. Therefore, // we need to ignore QueryStartedEvent if this method is called within SparkListenerBus // thread - if (!LiveListenerBus.withinListenerThread.value || !e.isInstanceOf[QueryStartedEvent]) { + // + // When loaded by Spark History Server, we should process all event coming from replay + // listener bus. + if (sparkListenerBus.isEmpty || !LiveListenerBus.withinListenerThread.value || + !e.isInstanceOf[QueryStartedEvent]) { postToAll(e) } case _ => @@ -110,7 +119,10 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) listener: StreamingQueryListener, event: StreamingQueryListener.Event): Unit = { def shouldReport(runId: UUID): Boolean = { - activeQueryRunIds.synchronized { activeQueryRunIds.contains(runId) } + // When loaded by Spark History Server, we should process all event coming from replay + // listener bus. + sparkListenerBus.isEmpty || + activeQueryRunIds.synchronized { activeQueryRunIds.contains(runId) } } event match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryHistoryServerPlugin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryHistoryServerPlugin.scala new file mode 100644 index 0000000000000..a127fa59b7433 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryHistoryServerPlugin.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import org.apache.spark.SparkConf +import org.apache.spark.scheduler.SparkListener +import org.apache.spark.sql.execution.streaming.StreamingQueryListenerBus +import org.apache.spark.sql.streaming.ui.{StreamingQueryStatusListener, StreamingQueryTab} +import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore} +import org.apache.spark.ui.SparkUI + +class StreamingQueryHistoryServerPlugin extends AppHistoryServerPlugin { + + override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = { + val listenerBus = new StreamingQueryListenerBus(None) + listenerBus.addListener(new StreamingQueryStatusListener(conf, store)) + Seq(listenerBus) + } + + override def setupUI(ui: SparkUI): Unit = { + val streamingQueryStatusStore = new StreamingQueryStatusStore(ui.store.store) + if (streamingQueryStatusStore.allQueryUIData.nonEmpty) { + new StreamingQueryTab(streamingQueryStatusStore, ui) + } + } + + override def displayOrder: Int = 1 +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryStatusStore.scala new file mode 100644 index 0000000000000..9eb14a6a63063 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/StreamingQueryStatusStore.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import java.util.UUID + +import org.apache.spark.sql.streaming.ui.{StreamingQueryData, StreamingQueryProgressWrapper, StreamingQueryUIData} +import org.apache.spark.status.KVUtils +import org.apache.spark.util.kvstore.KVStore + +/** + * Provides a view of a KVStore with methods that make it easy to query Streaming Query state. + * There's no state kept in this class, so it's ok to have multiple instances of it in an + * application. 
+ */ +class StreamingQueryStatusStore(store: KVStore) { + + def allQueryUIData: Seq[StreamingQueryUIData] = { + val view = store.view(classOf[StreamingQueryData]).index("startTimestamp").first(0L) + KVUtils.viewToSeq(view, Int.MaxValue)(_ => true).map(makeUIData) + } + + // visible for test + private[sql] def getQueryProgressData(runId: UUID): Seq[StreamingQueryProgressWrapper] = { + val view = store.view(classOf[StreamingQueryProgressWrapper]) + .index("runId").first(runId.toString).last(runId.toString) + KVUtils.viewToSeq(view, Int.MaxValue)(_ => true) + } + + private def makeUIData(summary: StreamingQueryData): StreamingQueryUIData = { + val runId = summary.runId.toString + val view = store.view(classOf[StreamingQueryProgressWrapper]) + .index("runId").first(runId).last(runId) + val recentProgress = KVUtils.viewToSeq(view, Int.MaxValue)(_ => true) + .map(_.progress).sortBy(_.timestamp).toArray + StreamingQueryUIData(summary, recentProgress) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index 89aceacac6007..ea430db9f030f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -34,7 +34,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.CacheManager import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SQLTab} +import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SQLTab, StreamingQueryStatusStore} import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.sql.streaming.ui.{StreamingQueryStatusListener, StreamingQueryTab} import org.apache.spark.status.ElementTrackingStore @@ -111,9 +111,9 @@ private[sql] class SharedState( lazy val streamingQueryStatusListener: Option[StreamingQueryStatusListener] = { sparkContext.ui.flatMap { ui => if (conf.get(STREAMING_UI_ENABLED)) { - val statusListener = new StreamingQueryStatusListener(conf) - new StreamingQueryTab(statusListener, ui) - Some(statusListener) + val kvStore = sparkContext.statusStore.store.asInstanceOf[ElementTrackingStore] + new StreamingQueryTab(new StreamingQueryStatusStore(kvStore), ui) + Some(new StreamingQueryStatusListener(conf, kvStore)) } else { None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index ffdbe9d4e4915..b66037d00919d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -49,7 +49,8 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo private[sql] val stateStoreCoordinator = StateStoreCoordinatorRef.forDriver(sparkSession.sparkContext.env) - private val listenerBus = new StreamingQueryListenerBus(sparkSession.sparkContext.listenerBus) + private val listenerBus = + new StreamingQueryListenerBus(Some(sparkSession.sparkContext.listenerBus)) @GuardedBy("activeQueriesSharedLock") private val activeQueries = new mutable.HashMap[UUID, StreamingQuery] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala index b98fdf16eef31..96e498991e1bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala @@ -40,8 +40,8 @@ private[ui] class StreamingQueryPage(parent: StreamingQueryTab) } private def generateStreamingQueryTable(request: HttpServletRequest): Seq[Node] = { - val (activeQueries, inactiveQueries) = parent.statusListener.allQueryStatus - .partition(_.isActive) + val (activeQueries, inactiveQueries) = + parent.store.allQueryUIData.partition(_.summary.isActive) val content = mutable.ListBuffer[Node]() // show active queries table only if there is at least one active query @@ -176,7 +176,7 @@ class StreamingQueryPagedTable( val streamingQuery = query.streamingUIData val statisticsLink = "%s/%s/statistics?id=%s" .format(SparkUIUtils.prependBaseUri(request, parent.basePath), parent.prefix, - streamingQuery.runId) + streamingQuery.summary.runId) def details(detail: Any): Seq[Node] = { if (isActive) { @@ -194,14 +194,14 @@ class StreamingQueryPagedTable( {UIUtils.getQueryName(streamingQuery)} {UIUtils.getQueryStatus(streamingQuery)} - {streamingQuery.id} - {streamingQuery.runId} - {SparkUIUtils.formatDate(streamingQuery.startTimestamp)} + {streamingQuery.summary.id} + {streamingQuery.summary.runId} + {SparkUIUtils.formatDate(streamingQuery.summary.startTimestamp)} {SparkUIUtils.formatDurationVerbose(query.duration)} {withNoProgress(streamingQuery, {query.avgInput.formatted("%.2f")}, "NaN")} {withNoProgress(streamingQuery, {query.avgProcess.formatted("%.2f")}, "NaN")} {withNoProgress(streamingQuery, {streamingQuery.lastProgress.batchId}, "NaN")} - {details(streamingQuery.exception.getOrElse("-"))} + {details(streamingQuery.summary.exception.getOrElse("-"))} } } @@ -222,32 +222,32 @@ class StreamingQueryDataSource(uiData: Seq[StreamingQueryUIData], sortColumn: St override def sliceData(from: Int, to: Int): Seq[StructuredStreamingRow] = data.slice(from, to) - private def streamingRow(query: StreamingQueryUIData): StructuredStreamingRow = { + private def streamingRow(uiData: StreamingQueryUIData): StructuredStreamingRow = { val duration = if (isActive) { - System.currentTimeMillis() - query.startTimestamp + System.currentTimeMillis() - uiData.summary.startTimestamp } else { - withNoProgress(query, { - val endTimeMs = query.lastProgress.timestamp - parseProgressTimestamp(endTimeMs) - query.startTimestamp + withNoProgress(uiData, { + val endTimeMs = uiData.lastProgress.timestamp + parseProgressTimestamp(endTimeMs) - uiData.summary.startTimestamp }, 0) } - val avgInput = (query.recentProgress.map(p => withNumberInvalid(p.inputRowsPerSecond)).sum / - query.recentProgress.length) + val avgInput = (uiData.recentProgress.map(p => withNumberInvalid(p.inputRowsPerSecond)).sum / + uiData.recentProgress.length) - val avgProcess = (query.recentProgress.map(p => - withNumberInvalid(p.processedRowsPerSecond)).sum / query.recentProgress.length) + val avgProcess = (uiData.recentProgress.map(p => + withNumberInvalid(p.processedRowsPerSecond)).sum / uiData.recentProgress.length) - StructuredStreamingRow(duration, avgInput, avgProcess, query) + StructuredStreamingRow(duration, avgInput, avgProcess, uiData) } private def ordering(sortColumn: String, desc: Boolean): Ordering[StructuredStreamingRow] = { val ordering: Ordering[StructuredStreamingRow] = sortColumn match { - case "Name" => Ordering.by(q => 
UIUtils.getQueryName(q.streamingUIData)) - case "Status" => Ordering.by(q => UIUtils.getQueryStatus(q.streamingUIData)) - case "ID" => Ordering.by(_.streamingUIData.id) - case "Run ID" => Ordering.by(_.streamingUIData.runId) - case "Start Time" => Ordering.by(_.streamingUIData.startTimestamp) + case "Name" => Ordering.by(row => UIUtils.getQueryName(row.streamingUIData)) + case "Status" => Ordering.by(row => UIUtils.getQueryStatus(row.streamingUIData)) + case "ID" => Ordering.by(_.streamingUIData.summary.id) + case "Run ID" => Ordering.by(_.streamingUIData.summary.runId) + case "Start Time" => Ordering.by(_.streamingUIData.summary.startTimestamp) case "Duration" => Ordering.by(_.duration) case "Avg Input /sec" => Ordering.by(_.avgInput) case "Avg Process /sec" => Ordering.by(_.avgProcess) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala index 24709ba470cde..97691d9d7e827 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala @@ -58,8 +58,8 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) val parameterId = request.getParameter("id") require(parameterId != null && parameterId.nonEmpty, "Missing id parameter") - val query = parent.statusListener.allQueryStatus.find { case q => - q.runId.equals(UUID.fromString(parameterId)) + val query = parent.store.allQueryUIData.find { uiData => + uiData.summary.runId.equals(UUID.fromString(parameterId)) }.getOrElse(throw new IllegalArgumentException(s"Failed to find streaming query $parameterId")) val resources = generateLoadResources(request) @@ -109,34 +109,35 @@ private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) } - def generateBasicInfo(query: StreamingQueryUIData): Seq[Node] = { - val duration = if (query.isActive) { - SparkUIUtils.formatDurationVerbose(System.currentTimeMillis() - query.startTimestamp) + def generateBasicInfo(uiData: StreamingQueryUIData): Seq[Node] = { + val duration = if (uiData.summary.isActive) { + val durationMs = System.currentTimeMillis() - uiData.summary.startTimestamp + SparkUIUtils.formatDurationVerbose(durationMs) } else { - withNoProgress(query, { - val end = query.lastProgress.timestamp - val start = query.recentProgress.head.timestamp + withNoProgress(uiData, { + val end = uiData.lastProgress.timestamp + val start = uiData.recentProgress.head.timestamp SparkUIUtils.formatDurationVerbose( parseProgressTimestamp(end) - parseProgressTimestamp(start)) }, "-") } - val name = UIUtils.getQueryName(query) - val numBatches = withNoProgress(query, { query.lastProgress.batchId + 1L }, 0) + val name = UIUtils.getQueryName(uiData) + val numBatches = withNoProgress(uiData, { uiData.lastProgress.batchId + 1L }, 0)
       Running batches for {duration} since
-      {SparkUIUtils.formatDate(query.startTimestamp)}
+      {SparkUIUtils.formatDate(uiData.summary.startTimestamp)}
       ({numBatches} completed batches)

       Name: {name}
-      Id: {query.id}
-      RunId: {query.runId}
+      Id: {uiData.summary.id}
+      RunId: {uiData.summary.runId}

      } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala index e331083b30024..fdd3754344108 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala @@ -20,102 +20,144 @@ package org.apache.spark.sql.streaming.ui import java.util.UUID import java.util.concurrent.ConcurrentHashMap -import scala.collection.JavaConverters._ import scala.collection.mutable +import com.fasterxml.jackson.annotation.JsonIgnore + import org.apache.spark.SparkConf import org.apache.spark.sql.internal.StaticSQLConf import org.apache.spark.sql.streaming.{StreamingQueryListener, StreamingQueryProgress} +import org.apache.spark.sql.streaming.ui.StreamingQueryProgressWrapper._ import org.apache.spark.sql.streaming.ui.UIUtils.parseProgressTimestamp +import org.apache.spark.status.{ElementTrackingStore, KVUtils} +import org.apache.spark.status.KVUtils.KVIndexParam +import org.apache.spark.util.kvstore.KVIndex /** * A customized StreamingQueryListener used in structured streaming UI, which contains all * UI data for both active and inactive query. - * TODO: Add support for history server. */ -private[sql] class StreamingQueryStatusListener(conf: SparkConf) extends StreamingQueryListener { - - /** - * We use runId as the key here instead of id in active query status map, - * because the runId is unique for every started query, even it its a restart. - */ - private[ui] val activeQueryStatus = new ConcurrentHashMap[UUID, StreamingQueryUIData]() - private[ui] val inactiveQueryStatus = new mutable.Queue[StreamingQueryUIData]() +private[sql] class StreamingQueryStatusListener( + conf: SparkConf, + store: ElementTrackingStore) extends StreamingQueryListener { private val streamingProgressRetention = conf.get(StaticSQLConf.STREAMING_UI_RETAINED_PROGRESS_UPDATES) private val inactiveQueryStatusRetention = conf.get(StaticSQLConf.STREAMING_UI_RETAINED_QUERIES) + store.addTrigger(classOf[StreamingQueryData], inactiveQueryStatusRetention) { count => + cleanupInactiveQueries(count) + } + + // Events from the same query run will never be processed concurrently, so it's safe to + // access `progressIds` without any protection. 
+ private val queryToProgress = new ConcurrentHashMap[UUID, mutable.Queue[String]]() + + private def cleanupInactiveQueries(count: Long): Unit = { + val view = store.view(classOf[StreamingQueryData]).index("active").first(false).last(false) + val inactiveQueries = KVUtils.viewToSeq(view, Int.MaxValue)(_ => true) + val numInactiveQueries = inactiveQueries.size + if (numInactiveQueries <= inactiveQueryStatusRetention) { + return + } + val toDelete = inactiveQueries.sortBy(_.endTimestamp.get) + .take(numInactiveQueries - inactiveQueryStatusRetention) + val runIds = toDelete.map { e => + store.delete(e.getClass, e.runId) + e.runId.toString + } + // Delete wrappers in one pass, as deleting them for each summary is slow + store.removeAllByIndexValues(classOf[StreamingQueryProgressWrapper], "runId", runIds) + } + override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = { val startTimestamp = parseProgressTimestamp(event.timestamp) - activeQueryStatus.putIfAbsent(event.runId, - new StreamingQueryUIData(event.name, event.id, event.runId, startTimestamp)) + store.write(new StreamingQueryData( + event.name, + event.id, + event.runId, + isActive = true, + None, + startTimestamp + ), checkTriggers = true) } override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { - val batchTimestamp = parseProgressTimestamp(event.progress.timestamp) - val queryStatus = activeQueryStatus.getOrDefault( - event.progress.runId, - new StreamingQueryUIData(event.progress.name, event.progress.id, event.progress.runId, - batchTimestamp)) - queryStatus.updateProcess(event.progress, streamingProgressRetention) - } - - override def onQueryTerminated( - event: StreamingQueryListener.QueryTerminatedEvent): Unit = synchronized { - val queryStatus = activeQueryStatus.remove(event.runId) - if (queryStatus != null) { - queryStatus.queryTerminated(event) - inactiveQueryStatus += queryStatus - while (inactiveQueryStatus.length >= inactiveQueryStatusRetention) { - inactiveQueryStatus.dequeue() - } + val runId = event.progress.runId + val batchId = event.progress.batchId + val timestamp = event.progress.timestamp + if (!queryToProgress.containsKey(runId)) { + queryToProgress.put(runId, mutable.Queue.empty[String]) + } + val progressIds = queryToProgress.get(runId) + progressIds.enqueue(getUniqueId(runId, batchId, timestamp)) + store.write(new StreamingQueryProgressWrapper(event.progress)) + while (progressIds.length > streamingProgressRetention) { + val uniqueId = progressIds.dequeue + store.delete(classOf[StreamingQueryProgressWrapper], uniqueId) } } - def allQueryStatus: Seq[StreamingQueryUIData] = synchronized { - activeQueryStatus.values().asScala.toSeq ++ inactiveQueryStatus + override def onQueryTerminated( + event: StreamingQueryListener.QueryTerminatedEvent): Unit = { + val querySummary = store.read(classOf[StreamingQueryData], event.runId) + val curTime = System.currentTimeMillis() + store.write(new StreamingQueryData( + querySummary.name, + querySummary.id, + querySummary.runId, + isActive = false, + querySummary.exception, + querySummary.startTimestamp, + Some(curTime) + ), checkTriggers = true) + queryToProgress.remove(event.runId) } } +private[sql] class StreamingQueryData( + val name: String, + val id: UUID, + @KVIndexParam val runId: UUID, + @KVIndexParam("active") val isActive: Boolean, + val exception: Option[String], + @KVIndexParam("startTimestamp") val startTimestamp: Long, + val endTimestamp: Option[Long] = None) + /** * This class contains all message 
related to UI display, each instance corresponds to a single * [[org.apache.spark.sql.streaming.StreamingQuery]]. */ -private[ui] class StreamingQueryUIData( - val name: String, - val id: UUID, - val runId: UUID, - val startTimestamp: Long) { - - /** Holds the most recent query progress updates. */ - private val progressBuffer = new mutable.Queue[StreamingQueryProgress]() - - private var _isActive = true - private var _exception: Option[String] = None - - def isActive: Boolean = synchronized { _isActive } - - def exception: Option[String] = synchronized { _exception } - - def queryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = synchronized { - _isActive = false - _exception = event.exception - } - - def updateProcess( - newProgress: StreamingQueryProgress, retentionNum: Int): Unit = progressBuffer.synchronized { - progressBuffer += newProgress - while (progressBuffer.length >= retentionNum) { - progressBuffer.dequeue() +private[sql] case class StreamingQueryUIData( + summary: StreamingQueryData, + recentProgress: Array[StreamingQueryProgress]) { + + def lastProgress: StreamingQueryProgress = { + if (recentProgress.nonEmpty) { + recentProgress.last + } else { + null } } +} - def recentProgress: Array[StreamingQueryProgress] = progressBuffer.synchronized { - progressBuffer.toArray - } +private[sql] class StreamingQueryProgressWrapper(val progress: StreamingQueryProgress) { + @JsonIgnore @KVIndex + private val uniqueId: String = getUniqueId(progress.runId, progress.batchId, progress.timestamp) - def lastProgress: StreamingQueryProgress = progressBuffer.synchronized { - progressBuffer.lastOption.orNull + @JsonIgnore @KVIndex("runId") + private def runIdIndex: String = progress.runId.toString +} + +private[sql] object StreamingQueryProgressWrapper { + /** + * Adding `timestamp` into unique id to support reporting `empty` query progress + * in which no data comes but with the same batchId. 
+ */ + def getUniqueId( + runId: UUID, + batchId: Long, + timestamp: String): String = { + s"${runId}_${batchId}_$timestamp" } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala index bb097ffc06912..65cad8f06cc1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.streaming.ui import org.apache.spark.internal.Logging +import org.apache.spark.sql.execution.ui.StreamingQueryStatusStore import org.apache.spark.ui.{SparkUI, SparkUITab} private[sql] class StreamingQueryTab( - val statusListener: StreamingQueryStatusListener, + val store: StreamingQueryStatusStore, sparkUI: SparkUI) extends SparkUITab(sparkUI, "StreamingQuery") with Logging { override val name = "Structured Streaming" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala index 1f7e65dede170..88a110fa9a329 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala @@ -46,19 +46,19 @@ private[ui] object UIUtils { } } - def getQueryName(query: StreamingQueryUIData): String = { - if (query.name == null || query.name.isEmpty) { + def getQueryName(uiData: StreamingQueryUIData): String = { + if (uiData.summary.name == null || uiData.summary.name.isEmpty) { "" } else { - query.name + uiData.summary.name } } - def getQueryStatus(query: StreamingQueryUIData): String = { - if (query.isActive) { + def getQueryStatus(uiData: StreamingQueryUIData): String = { + if (uiData.summary.isActive) { "RUNNING" } else { - query.exception.map(_ => "FAILED").getOrElse("FINISHED") + uiData.summary.exception.map(_ => "FAILED").getOrElse("FINISHED") } } diff --git a/sql/core/src/test/resources/spark-events/local-1596020211915 b/sql/core/src/test/resources/spark-events/local-1596020211915 new file mode 100644 index 0000000000000..ff34bbc16ef3a --- /dev/null +++ b/sql/core/src/test/resources/spark-events/local-1596020211915 @@ -0,0 +1,160 @@ +{"Event":"SparkListenerLogStart","Spark Version":"3.1.0-SNAPSHOT"} +{"Event":"SparkListenerResourceProfileAdded","Resource Profile Id":0,"Executor Resource Requests":{"cores":{"Resource Name":"cores","Amount":1,"Discovery Script":"","Vendor":""},"memory":{"Resource Name":"memory","Amount":1024,"Discovery Script":"","Vendor":""}},"Task Resource Requests":{"cpus":{"Resource Name":"cpus","Amount":1.0}}} +{"Event":"SparkListenerExecutorAdded","Timestamp":1596020212090,"Executor ID":"driver","Executor Info":{"Host":"iZbp19vpr16ix621sdw476Z","Total Cores":4,"Log Urls":{},"Attributes":{},"Resources":{},"Resource Profile Id":0}} +{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Port":39845},"Maximum Memory":384093388,"Timestamp":1596020212109,"Maximum Onheap Memory":384093388,"Maximum Offheap Memory":0} +{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre","Java Version":"1.8.0_252 (Oracle Corporation)","Scala Version":"version 2.12.10"},"Spark 
Properties":{"spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.driver.port":"46309","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","spark.app.name":"StructuredKafkaWordCount","spark.scheduler.mode":"FIFO","spark.submit.pyFiles":"","spark.executor.id":"driver","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"},"Hadoop Properties":{"yarn.resourcemanager.amlauncher.thread-count":"50","yarn.sharedcache.enabled":"false","fs.s3a.connection.maximum":"15","fs.s3a.impl":"org.apache.hadoop.fs.s3a.S3AFileSystem","yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms":"1000","hadoop.security.kms.client.timeout":"60","hadoop.http.authentication.kerberos.principal":"HTTP/_HOST@LOCALHOST","mapreduce.framework.name":"local","yarn.sharedcache.uploader.server.thread-count":"50","yarn.nodemanager.linux-container-executor.nonsecure-mode.user-pattern":"^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$","tfile.fs.output.buffer.size":"262144","yarn.app.mapreduce.am.job.task.listener.thread-count":"30","hadoop.security.groups.cache.background.reload.threads":"3","yarn.resourcemanager.webapp.cross-origin.enabled":"false","fs.AbstractFileSystem.ftp.impl":"org.apache.hadoop.fs.ftp.FtpFs","fs.s3.block.size":"67108864","hadoop.registry.secure":"false","hadoop.shell.safely.delete.limit.num.files":"100","dfs.bytes-per-checksum":"512","fs.s3.buffer.dir":"${hadoop.tmp.dir}/s3","mapreduce.job.acl-view-job":" ","mapreduce.jobhistory.loadedjobs.cache.size":"5","mapreduce.input.fileinputformat.split.minsize":"0","yarn.resourcemanager.container.liveness-monitor.interval-ms":"600000","yarn.resourcemanager.client.thread-count":"50","io.seqfile.compress.blocksize":"1000000","yarn.sharedcache.checksum.algo.impl":"org.apache.hadoop.yarn.sharedcache.ChecksumSHA256Impl","yarn.nodemanager.amrmproxy.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor","yarn.timeline-service.entity-group-fs-store.leveldb-cache-read-cache-size":"10485760","mapreduce.reduce.shuffle.fetch.retry.interval-ms":"1000","mapreduce.task.profile.maps":"0-2","yarn.scheduler.include-port-in-node-name":"false","yarn.nodemanager.admin-env":"MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX","yarn.resourcemanager.node-removal-untracked.timeout-ms":"60000","mapreduce.am.max-attempts":"2","hadoop.security.kms.client.failover.sleep.base.millis":"100","mapreduce.jobhistory.webapp.https.address":"0.0.0.0:19890","yarn.node-labels.fs-store.impl.class":"org.apache.hadoop.yarn.nodelabels.FileSystemNodeLabelsStore","fs.trash.checkpoint.interval":"0","mapreduce.job.map.output.collector.class":"org.apache.hadoop.mapred.MapTask$MapOutputBuffer","yarn.resourcemanager.node-ip-cache.expiry-interval-secs":"-1","hadoop.http.authentication.signature.secret.file":"*********(redacted)","hadoop.jetty.logs.serve.aliases":"true","yarn.timeline-service.handler-thread-count":"10","yarn.resourcemanager.max-completed-applications":"10000","yarn.resourcemanager.system-metrics-publisher.enabled":"false","yarn.sharedcache.webapp.address":"0.0.0.0:8788","yarn.resourcemanager.delegation.token.renew-interval":"*********(redacted)","yarn.sharedcache.nm.uploader.replication.factor":"10","hadoop.security.groups.negative-cache.secs":"30","yarn.app.mapreduce.task.container.log.backups":"0","mapreduce.reduce.skip.proc-count.auto-incr":"true","hadoop.se
curity.group.mapping.ldap.posix.attr.gid.name":"gidNumber","ipc.client.fallback-to-simple-auth-allowed":"false","yarn.client.failover-proxy-provider":"org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider","yarn.timeline-service.http-authentication.simple.anonymous.allowed":"true","ha.health-monitor.check-interval.ms":"1000","yarn.acl.reservation-enable":"false","yarn.resourcemanager.store.class":"org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore","yarn.app.mapreduce.am.hard-kill-timeout-ms":"10000","yarn.nodemanager.container-metrics.enable":"true","yarn.timeline-service.client.fd-clean-interval-secs":"60","yarn.nodemanager.docker-container-executor.exec-name":"/usr/bin/docker","yarn.resourcemanager.nodemanagers.heartbeat-interval-ms":"1000","mapred.child.java.opts":"-Xmx200m","hadoop.common.configuration.version":"0.23.0","yarn.nodemanager.remote-app-log-dir-suffix":"logs","yarn.nodemanager.windows-container.cpu-limit.enabled":"false","yarn.nodemanager.runtime.linux.docker.privileged-containers.allowed":"false","file.blocksize":"67108864","hadoop.registry.zk.retry.ceiling.ms":"60000","yarn.sharedcache.store.in-memory.initial-delay-mins":"10","mapreduce.jobhistory.principal":"jhs/_HOST@REALM.TLD","mapreduce.map.skip.proc-count.auto-incr":"true","mapreduce.task.profile.reduces":"0-2","yarn.timeline-service.webapp.https.address":"${yarn.timeline-service.hostname}:8190","yarn.resourcemanager.scheduler.address":"${yarn.resourcemanager.hostname}:8030","yarn.node-labels.enabled":"false","yarn.resourcemanager.webapp.ui-actions.enabled":"true","mapreduce.task.timeout":"600000","yarn.sharedcache.client-server.thread-count":"50","hadoop.security.crypto.cipher.suite":"AES/CTR/NoPadding","yarn.resourcemanager.connect.max-wait.ms":"900000","fs.defaultFS":"file:///","yarn.minicluster.use-rpc":"false","fs.har.impl.disable.cache":"true","io.compression.codec.bzip2.library":"system-native","mapreduce.shuffle.connection-keep-alive.timeout":"5","yarn.resourcemanager.webapp.https.address":"${yarn.resourcemanager.hostname}:8090","mapreduce.jobhistory.address":"0.0.0.0:10020","yarn.resourcemanager.nm-tokens.master-key-rolling-interval-secs":"*********(redacted)","yarn.is.minicluster":"false","yarn.nodemanager.address":"${yarn.nodemanager.hostname}:0","fs.AbstractFileSystem.s3a.impl":"org.apache.hadoop.fs.s3a.S3A","mapreduce.task.combine.progress.records":"10000","yarn.resourcemanager.am.max-attempts":"2","yarn.nodemanager.linux-container-executor.cgroups.hierarchy":"/hadoop-yarn","ipc.server.log.slow.rpc":"false","yarn.resourcemanager.node-labels.provider.fetch-interval-ms":"1800000","yarn.nodemanager.webapp.cross-origin.enabled":"false","yarn.app.mapreduce.am.job.committer.cancel-timeout":"60000","ftp.bytes-per-checksum":"512","yarn.nodemanager.resource.memory-mb":"-1","fs.s3a.fast.upload.active.blocks":"4","mapreduce.jobhistory.joblist.cache.size":"20000","fs.ftp.host":"0.0.0.0","yarn.resourcemanager.fs.state-store.num-retries":"0","yarn.resourcemanager.nodemanager-connect-retries":"10","hadoop.security.kms.client.encrypted.key.cache.low-watermark":"0.3f","yarn.timeline-service.client.max-retries":"30","dfs.ha.fencing.ssh.connect-timeout":"30000","yarn.log-aggregation-enable":"false","mapreduce.reduce.markreset.buffer.percent":"0.0","fs.AbstractFileSystem.viewfs.impl":"org.apache.hadoop.fs.viewfs.ViewFs","mapreduce.task.io.sort.factor":"10","yarn.nodemanager.amrmproxy.client.thread-count":"25","ha.failover-controller.new-active.rpc-timeout.ms":"60000","yarn.nodemanager.co
ntainer-localizer.java.opts":"-Xmx256m","mapreduce.jobhistory.datestring.cache.size":"200000","mapreduce.job.acl-modify-job":" ","yarn.nodemanager.windows-container.memory-limit.enabled":"false","yarn.timeline-service.webapp.address":"${yarn.timeline-service.hostname}:8188","yarn.app.mapreduce.am.job.committer.commit-window":"10000","yarn.nodemanager.container-manager.thread-count":"20","yarn.minicluster.fixed.ports":"false","yarn.cluster.max-application-priority":"0","yarn.timeline-service.ttl-enable":"true","mapreduce.jobhistory.recovery.store.fs.uri":"${hadoop.tmp.dir}/mapred/history/recoverystore","hadoop.caller.context.signature.max.size":"40","ha.zookeeper.session-timeout.ms":"10000","tfile.io.chunk.size":"1048576","mapreduce.job.speculative.slowtaskthreshold":"1.0","io.serializations":"org.apache.hadoop.io.serializer.WritableSerialization, org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization, org.apache.hadoop.io.serializer.avro.AvroReflectSerialization","hadoop.security.kms.client.failover.sleep.max.millis":"2000","hadoop.security.group.mapping.ldap.directory.search.timeout":"10000","fs.swift.impl":"org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem","yarn.nodemanager.local-cache.max-files-per-directory":"8192","hadoop.http.cross-origin.enabled":"false","mapreduce.map.sort.spill.percent":"0.80","yarn.timeline-service.entity-group-fs-store.scan-interval-seconds":"60","yarn.timeline-service.client.best-effort":"false","yarn.resourcemanager.webapp.delegation-token-auth-filter.enabled":"*********(redacted)","hadoop.security.group.mapping.ldap.posix.attr.uid.name":"uidNumber","fs.AbstractFileSystem.swebhdfs.impl":"org.apache.hadoop.fs.SWebHdfs","mapreduce.ifile.readahead":"true","yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms":"300000","hadoop.security.kms.client.encrypted.key.cache.num.refill.threads":"2","yarn.resourcemanager.scheduler.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler","yarn.app.mapreduce.am.command-opts":"-Xmx1024m","mapreduce.cluster.local.dir":"${hadoop.tmp.dir}/mapred/local","io.mapfile.bloom.error.rate":"0.005","yarn.nodemanager.runtime.linux.allowed-runtimes":"default","yarn.sharedcache.store.class":"org.apache.hadoop.yarn.server.sharedcachemanager.store.InMemorySCMStore","ha.failover-controller.graceful-fence.rpc-timeout.ms":"5000","ftp.replication":"3","hadoop.security.uid.cache.secs":"14400","mapreduce.job.maxtaskfailures.per.tracker":"3","io.skip.checksum.errors":"false","yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts":"3","fs.s3a.connection.timeout":"200000","mapreduce.job.max.split.locations":"10","hadoop.registry.zk.session.timeout.ms":"60000","mapreduce.jvm.system-properties-to-log":"os.name,os.version,java.home,java.runtime.version,java.vendor,java.version,java.vm.name,java.class.path,java.io.tmpdir,user.dir,user.name","yarn.timeline-service.entity-group-fs-store.active-dir":"/tmp/entity-file-history/active","mapreduce.shuffle.transfer.buffer.size":"131072","yarn.timeline-service.client.retry-interval-ms":"1000","yarn.http.policy":"HTTP_ONLY","fs.s3a.socket.send.buffer":"8192","yarn.sharedcache.uploader.server.address":"0.0.0.0:8046","hadoop.http.authentication.token.validity":"*********(redacted)","mapreduce.shuffle.max.connections":"0","yarn.minicluster.yarn.nodemanager.resource.memory-mb":"4096","mapreduce.job.emit-timeline-data":"false","yarn.nodemanager.resource.system-reserved-memory-mb":"-1","hadoop.kerberos.min.seconds.before.relogin":"60","mapreduce.jobhistory.move.
thread-count":"3","yarn.resourcemanager.admin.client.thread-count":"1","yarn.dispatcher.drain-events.timeout":"300000","fs.s3a.buffer.dir":"${hadoop.tmp.dir}/s3a","hadoop.ssl.enabled.protocols":"TLSv1,SSLv2Hello,TLSv1.1,TLSv1.2","mapreduce.jobhistory.admin.address":"0.0.0.0:10033","yarn.log-aggregation-status.time-out.ms":"600000","mapreduce.shuffle.port":"13562","yarn.resourcemanager.max-log-aggregation-diagnostics-in-memory":"10","yarn.nodemanager.health-checker.interval-ms":"600000","ftp.blocksize":"67108864","yarn.nodemanager.log-container-debug-info.enabled":"false","yarn.client.max-cached-nodemanagers-proxies":"0","yarn.nodemanager.linux-container-executor.cgroups.delete-delay-ms":"20","yarn.nodemanager.delete.debug-delay-sec":"0","yarn.nodemanager.pmem-check-enabled":"true","yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":"90.0","mapreduce.app-submission.cross-platform":"false","yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms":"10000","hadoop.security.groups.cache.secs":"300","yarn.resourcemanager.zk-retry-interval-ms":"1000","ipc.maximum.data.length":"67108864","mapreduce.shuffle.max.threads":"0","hadoop.security.authorization":"false","mapreduce.job.complete.cancel.delegation.tokens":"*********(redacted)","fs.s3a.paging.maximum":"5000","nfs.exports.allowed.hosts":"* rw","mapreduce.jobhistory.http.policy":"HTTP_ONLY","yarn.sharedcache.store.in-memory.check-period-mins":"720","s3native.replication":"3","hadoop.security.group.mapping.ldap.ssl":"false","yarn.client.application-client-protocol.poll-interval-ms":"200","ha.zookeeper.parent-znode":"/hadoop-ha","yarn.nodemanager.log-aggregation.policy.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AllContainerLogAggregationPolicy","mapreduce.reduce.shuffle.merge.percent":"0.66","hadoop.security.group.mapping.ldap.search.filter.group":"(objectClass=group)","yarn.nodemanager.resourcemanager.minimum.version":"NONE","mapreduce.job.speculative.speculative-cap-running-tasks":"0.1","yarn.admin.acl":"*","yarn.nodemanager.recovery.supervised":"false","yarn.sharedcache.admin.thread-count":"1","yarn.resourcemanager.ha.automatic-failover.enabled":"true","mapreduce.reduce.skip.maxgroups":"0","mapreduce.reduce.shuffle.connect.timeout":"180000","yarn.resourcemanager.address":"${yarn.resourcemanager.hostname}:8032","ipc.client.ping":"true","mapreduce.task.local-fs.write-limit.bytes":"-1","fs.adl.oauth2.access.token.provider.type":"*********(redacted)","mapreduce.shuffle.ssl.file.buffer.size":"65536","yarn.resourcemanager.ha.automatic-failover.embedded":"true","hadoop.ssl.enabled":"false","fs.s3a.multipart.purge":"false","mapreduce.job.end-notification.max.attempts":"5","mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec","yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled":"false","ha.health-monitor.connect-retry-interval.ms":"1000","yarn.nodemanager.keytab":"/etc/krb5.keytab","mapreduce.jobhistory.keytab":"/etc/security/keytab/jhs.service.keytab","fs.s3a.threads.max":"10","mapreduce.reduce.shuffle.input.buffer.percent":"0.70","mapreduce.cluster.temp.dir":"${hadoop.tmp.dir}/mapred/temp","s3.replication":"3","yarn.nodemanager.node-labels.resync-interval-ms":"120000","hadoop.tmp.dir":"/tmp/hadoop-${user.name}","mapreduce.job.maps":"2","mapreduce.job.end-notification.max.retry.interval":"5000","yarn.log-aggregation.retain-check-interval-seconds":"-1","yarn.resourcemanager.resource-tracker.client.thread-count":"50","
yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size":"10000","yarn.resourcemanager.ha.automatic-failover.zk-base-path":"/yarn-leader-election","io.seqfile.local.dir":"${hadoop.tmp.dir}/io/local","mapreduce.client.submit.file.replication":"10","mapreduce.jobhistory.minicluster.fixed.ports":"false","fs.s3a.multipart.threshold":"2147483647","mapreduce.jobhistory.done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done","yarn.resourcemanager.zk-acl":"world:anyone:rwcda","ipc.client.idlethreshold":"4000","yarn.nodemanager.linux-container-executor.cgroups.strict-resource-usage":"false","mapreduce.reduce.input.buffer.percent":"0.0","yarn.nodemanager.amrmproxy.enable":"false","fs.ftp.host.port":"21","ipc.ping.interval":"60000","yarn.resourcemanager.history-writer.multi-threaded-dispatcher.pool-size":"10","yarn.resourcemanager.admin.address":"${yarn.resourcemanager.hostname}:8033","file.client-write-packet-size":"65536","ipc.client.kill.max":"10","mapreduce.reduce.speculative":"true","mapreduce.local.clientfactory.class.name":"org.apache.hadoop.mapred.LocalClientFactory","mapreduce.job.reducer.unconditional-preempt.delay.sec":"300","yarn.nodemanager.disk-health-checker.interval-ms":"120000","yarn.nodemanager.log.deletion-threads-count":"4","ipc.client.connection.maxidletime":"10000","mapreduce.task.io.sort.mb":"100","yarn.nodemanager.localizer.client.thread-count":"5","yarn.sharedcache.admin.address":"0.0.0.0:8047","yarn.nodemanager.localizer.cache.cleanup.interval-ms":"600000","hadoop.security.crypto.codec.classes.aes.ctr.nopadding":"org.apache.hadoop.crypto.OpensslAesCtrCryptoCodec, org.apache.hadoop.crypto.JceAesCtrCryptoCodec","fs.s3a.connection.ssl.enabled":"true","yarn.nodemanager.process-kill-wait.ms":"2000","mapreduce.job.hdfs-servers":"${fs.defaultFS}","hadoop.workaround.non.threadsafe.getpwuid":"true","fs.df.interval":"60000","fs.s3.sleepTimeSeconds":"10","fs.s3a.multiobjectdelete.enable":"true","yarn.sharedcache.cleaner.resource-sleep-ms":"0","yarn.nodemanager.disk-health-checker.min-healthy-disks":"0.25","hadoop.shell.missing.defaultFs.warning":"false","io.file.buffer.size":"65536","hadoop.security.group.mapping.ldap.search.attr.member":"member","hadoop.security.random.device.file.path":"/dev/urandom","hadoop.security.sensitive-config-keys":"*********(redacted)","hadoop.rpc.socket.factory.class.default":"org.apache.hadoop.net.StandardSocketFactory","yarn.intermediate-data-encryption.enable":"false","yarn.resourcemanager.connect.retry-interval.ms":"30000","yarn.scheduler.minimum-allocation-mb":"1024","yarn.app.mapreduce.am.staging-dir":"/tmp/hadoop-yarn/staging","mapreduce.reduce.shuffle.read.timeout":"180000","hadoop.http.cross-origin.max-age":"1800","fs.s3a.connection.establish.timeout":"5000","mapreduce.job.running.map.limit":"0","yarn.minicluster.control-resource-monitoring":"false","hadoop.ssl.require.client.cert":"false","hadoop.kerberos.kinit.command":"kinit","mapreduce.reduce.log.level":"INFO","hadoop.security.dns.log-slow-lookups.threshold.ms":"1000","mapreduce.job.ubertask.enable":"false","hadoop.caller.context.enabled":"false","yarn.nodemanager.vmem-pmem-ratio":"2.1","hadoop.rpc.protection":"authentication","ha.health-monitor.rpc-timeout.ms":"45000","s3native.stream-buffer-size":"4096","yarn.nodemanager.remote-app-log-dir":"/tmp/logs","yarn.nodemanager.resource.pcores-vcores-multiplier":"1.0","yarn.app.mapreduce.am.containerlauncher.threadpool-initial-size":"10","fs.s3n.multipart.uploads.enabled":"false","hadoop.security.crypto.buffer.size":"8192","yarn.
nodemanager.node-labels.provider.fetch-interval-ms":"600000","mapreduce.jobhistory.recovery.store.leveldb.path":"${hadoop.tmp.dir}/mapred/history/recoverystore","yarn.client.failover-retries-on-socket-timeouts":"0","hadoop.security.instrumentation.requires.admin":"false","yarn.nodemanager.delete.thread-count":"4","mapreduce.job.finish-when-all-reducers-done":"false","hadoop.registry.jaas.context":"Client","yarn.timeline-service.leveldb-timeline-store.path":"${hadoop.tmp.dir}/yarn/timeline","s3.blocksize":"67108864","io.map.index.interval":"128","mapreduce.job.counters.max":"120","yarn.timeline-service.store-class":"org.apache.hadoop.yarn.server.timeline.LeveldbTimelineStore","mapreduce.jobhistory.move.interval-ms":"180000","yarn.nodemanager.localizer.fetch.thread-count":"4","yarn.resourcemanager.scheduler.client.thread-count":"50","hadoop.ssl.hostname.verifier":"DEFAULT","yarn.timeline-service.leveldb-state-store.path":"${hadoop.tmp.dir}/yarn/timeline","mapreduce.job.classloader":"false","mapreduce.task.profile.map.params":"${mapreduce.task.profile.params}","ipc.client.connect.timeout":"20000","s3.stream-buffer-size":"4096","yarn.nm.liveness-monitor.expiry-interval-ms":"600000","yarn.resourcemanager.reservation-system.planfollower.time-step":"1000","s3native.bytes-per-checksum":"512","mapreduce.jobtracker.address":"local","yarn.nodemanager.recovery.enabled":"false","mapreduce.job.end-notification.retry.interval":"1000","fs.du.interval":"600000","hadoop.security.group.mapping.ldap.read.timeout.ms":"60000","hadoop.security.groups.cache.warn.after.ms":"5000","file.bytes-per-checksum":"512","yarn.node-labels.fs-store.retry-policy-spec":"2000, 500","hadoop.security.groups.cache.background.reload":"false","net.topology.script.number.args":"100","mapreduce.task.merge.progress.records":"10000","yarn.nodemanager.localizer.address":"${yarn.nodemanager.hostname}:8040","yarn.timeline-service.keytab":"/etc/krb5.keytab","mapreduce.reduce.shuffle.fetch.retry.timeout-ms":"30000","yarn.resourcemanager.rm.container-allocation.expiry-interval-ms":"600000","mapreduce.fileoutputcommitter.algorithm.version":"1","yarn.resourcemanager.work-preserving-recovery.enabled":"true","mapreduce.map.skip.maxrecords":"0","yarn.sharedcache.root-dir":"/sharedcache","hadoop.http.authentication.type":"simple","mapreduce.task.userlog.limit.kb":"0","yarn.resourcemanager.scheduler.monitor.enable":"false","fs.s3n.block.size":"67108864","ipc.client.connect.max.retries":"10","hadoop.registry.zk.retry.times":"5","mapreduce.jobtracker.staging.root.dir":"${hadoop.tmp.dir}/mapred/staging","yarn.nodemanager.resource-monitor.interval-ms":"3000","mapreduce.shuffle.listen.queue.size":"128","mapreduce.map.cpu.vcores":"1","yarn.timeline-service.client.fd-retain-secs":"300","hadoop.user.group.static.mapping.overrides":"dr.who=;","mapreduce.jobhistory.recovery.store.class":"org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService","yarn.resourcemanager.fail-fast":"${yarn.fail-fast}","yarn.resourcemanager.proxy-user-privileges.enabled":"false","mapreduce.job.reducer.preempt.delay.sec":"0","hadoop.util.hash.type":"murmur","yarn.app.mapreduce.client.job.max-retries":"0","mapreduce.reduce.shuffle.retry-delay.max.ms":"60000","hadoop.security.group.mapping.ldap.connection.timeout.ms":"60000","mapreduce.task.profile.params":"-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s","yarn.app.mapreduce.shuffle.log.backups":"0","hadoop.registry.zk.retry.interval.ms":"1000","yarn.nodemanager.linux-container-executor.cgrou
ps.delete-timeout-ms":"1000","fs.AbstractFileSystem.file.impl":"org.apache.hadoop.fs.local.LocalFs","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds":"-1","mapreduce.jobhistory.cleaner.interval-ms":"86400000","hadoop.registry.zk.quorum":"localhost:2181","mapreduce.output.fileoutputformat.compress":"false","yarn.resourcemanager.am-rm-tokens.master-key-rolling-interval-secs":"*********(redacted)","hadoop.ssl.server.conf":"ssl-server.xml","yarn.sharedcache.cleaner.initial-delay-mins":"10","mapreduce.client.completion.pollinterval":"5000","hadoop.ssl.keystores.factory.class":"org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory","yarn.app.mapreduce.am.resource.cpu-vcores":"1","yarn.timeline-service.enabled":"false","yarn.nodemanager.runtime.linux.docker.capabilities":"CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID,SETFCAP,SETPCAP,NET_BIND_SERVICE,SYS_CHROOT,KILL,AUDIT_WRITE","yarn.acl.enable":"false","yarn.timeline-service.entity-group-fs-store.done-dir":"/tmp/entity-file-history/done/","mapreduce.task.profile":"false","yarn.resourcemanager.fs.state-store.uri":"${hadoop.tmp.dir}/yarn/system/rmstore","yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user":"nobody","yarn.resourcemanager.configuration.provider-class":"org.apache.hadoop.yarn.LocalConfigurationProvider","yarn.resourcemanager.configuration.file-system-based-store":"/yarn/conf","yarn.nodemanager.resource.percentage-physical-cpu-limit":"100","mapreduce.jobhistory.client.thread-count":"10","tfile.fs.input.buffer.size":"262144","mapreduce.client.progressmonitor.pollinterval":"1000","yarn.nodemanager.log-dirs":"${yarn.log.dir}/userlogs","fs.automatic.close":"true","fs.s3n.multipart.copy.block.size":"5368709120","yarn.nodemanager.hostname":"0.0.0.0","yarn.resourcemanager.zk-timeout-ms":"10000","ftp.stream-buffer-size":"4096","yarn.fail-fast":"false","hadoop.security.group.mapping.ldap.search.filter.user":"(&(objectClass=user)(sAMAccountName={0}))","yarn.timeline-service.address":"${yarn.timeline-service.hostname}:10200","mapreduce.job.ubertask.maxmaps":"9","fs.s3a.threads.keepalivetime":"60","mapreduce.task.files.preserve.failedtasks":"false","yarn.app.mapreduce.client.job.retry-interval":"2000","ha.failover-controller.graceful-fence.connection.retries":"1","yarn.resourcemanager.delegation.token.max-lifetime":"*********(redacted)","yarn.timeline-service.entity-group-fs-store.summary-store":"org.apache.hadoop.yarn.server.timeline.LeveldbTimelineStore","mapreduce.reduce.cpu.vcores":"1","fs.client.resolve.remote.symlinks":"true","yarn.nodemanager.webapp.https.address":"0.0.0.0:8044","hadoop.http.cross-origin.allowed-origins":"*","yarn.timeline-service.entity-group-fs-store.retain-seconds":"604800","yarn.resourcemanager.metrics.runtime.buckets":"60,300,1440","yarn.timeline-service.generic-application-history.max-applications":"10000","yarn.nodemanager.local-dirs":"${hadoop.tmp.dir}/nm-local-dir","mapreduce.shuffle.connection-keep-alive.enable":"false","yarn.node-labels.configuration-type":"centralized","fs.s3a.path.style.access":"false","yarn.nodemanager.aux-services.mapreduce_shuffle.class":"org.apache.hadoop.mapred.ShuffleHandler","yarn.sharedcache.store.in-memory.staleness-period-mins":"10080","fs.adl.impl":"org.apache.hadoop.fs.adl.AdlFileSystem","yarn.resourcemanager.nodemanager.minimum.version":"NONE","net.topology.impl":"org.apache.hadoop.net.NetworkTopology","io.map.index.skip":"0","yarn.scheduler.maximum-allocation-vcores":"4","hadoop.http.cross-origin.allowed-headers":"X-Requested-Wit
h,Content-Type,Accept,Origin","yarn.nodemanager.log-aggregation.compression-type":"none","yarn.timeline-service.version":"1.0f","yarn.ipc.rpc.class":"org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC","mapreduce.reduce.maxattempts":"4","hadoop.security.dns.log-slow-lookups.enabled":"false","mapreduce.job.committer.setup.cleanup.needed":"true","mapreduce.job.running.reduce.limit":"0","ipc.maximum.response.length":"134217728","mapreduce.job.token.tracking.ids.enabled":"*********(redacted)","hadoop.caller.context.max.size":"128","hadoop.registry.system.acls":"sasl:yarn@, sasl:mapred@, sasl:hdfs@","yarn.nodemanager.recovery.dir":"${hadoop.tmp.dir}/yarn-nm-recovery","fs.s3a.fast.upload.buffer":"disk","mapreduce.jobhistory.intermediate-done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate","yarn.app.mapreduce.shuffle.log.separate":"true","fs.s3a.max.total.tasks":"5","fs.s3a.readahead.range":"64K","hadoop.http.authentication.simple.anonymous.allowed":"true","fs.s3a.fast.upload":"false","fs.s3a.attempts.maximum":"20","hadoop.registry.zk.connection.timeout.ms":"15000","yarn.resourcemanager.delegation-token-renewer.thread-count":"*********(redacted)","yarn.nodemanager.health-checker.script.timeout-ms":"1200000","yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size":"10000","mapreduce.map.log.level":"INFO","mapreduce.output.fileoutputformat.compress.type":"RECORD","yarn.resourcemanager.leveldb-state-store.path":"${hadoop.tmp.dir}/yarn/system/rmstore","hadoop.registry.rm.enabled":"false","mapreduce.ifile.readahead.bytes":"4194304","yarn.resourcemanager.fs.state-store.retry-policy-spec":"2000, 500","yarn.sharedcache.app-checker.class":"org.apache.hadoop.yarn.server.sharedcachemanager.RemoteAppChecker","yarn.nodemanager.linux-container-executor.nonsecure-mode.limit-users":"true","yarn.nodemanager.resource.detect-hardware-capabilities":"false","mapreduce.cluster.acls.enabled":"false","mapreduce.job.speculative.retry-after-no-speculate":"1000","yarn.resourcemanager.fs.state-store.retry-interval-ms":"1000","file.stream-buffer-size":"4096","mapreduce.map.output.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec","mapreduce.map.speculative":"true","mapreduce.job.speculative.retry-after-speculate":"15000","yarn.nodemanager.linux-container-executor.cgroups.mount":"false","yarn.app.mapreduce.am.container.log.backups":"0","yarn.app.mapreduce.am.log.level":"INFO","mapreduce.job.reduce.slowstart.completedmaps":"0.05","yarn.timeline-service.http-authentication.type":"simple","hadoop.security.group.mapping.ldap.search.attr.group.name":"cn","yarn.timeline-service.client.internal-timers-ttl-secs":"420","fs.s3a.block.size":"32M","yarn.sharedcache.client-server.address":"0.0.0.0:8045","yarn.resourcemanager.hostname":"0.0.0.0","yarn.resourcemanager.delegation.key.update-interval":"86400000","mapreduce.reduce.shuffle.fetch.retry.enabled":"${yarn.nodemanager.recovery.enabled}","mapreduce.map.memory.mb":"1024","mapreduce.task.skip.start.attempts":"2","fs.AbstractFileSystem.hdfs.impl":"org.apache.hadoop.fs.Hdfs","yarn.nodemanager.disk-health-checker.enable":"true","ipc.client.tcpnodelay":"true","ipc.client.rpc-timeout.ms":"0","fs.s3.maxRetries":"4","ipc.client.low-latency":"false","mapreduce.input.lineinputformat.linespermap":"1","ipc.client.connect.max.retries.on.timeouts":"45","yarn.timeline-service.leveldb-timeline-store.read-cache-size":"104857600","fs.AbstractFileSystem.har.impl":"org.apache.hadoop.fs.HarFs","mapreduce.job.split.metainfo.maxsize":"10000000","yarn.am.liveness-m
onitor.expiry-interval-ms":"600000","yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs":"*********(redacted)","yarn.timeline-service.entity-group-fs-store.app-cache-size":"10","fs.s3a.socket.recv.buffer":"8192","fs.s3n.multipart.uploads.block.size":"67108864","yarn.resourcemanager.resource-tracker.address":"${yarn.resourcemanager.hostname}:8031","yarn.nodemanager.node-labels.provider.fetch-timeout-ms":"1200000","yarn.resourcemanager.leveldb-state-store.compaction-interval-secs":"3600","mapreduce.client.output.filter":"FAILED","hadoop.http.filter.initializers":"org.apache.hadoop.http.lib.StaticUserWebFilter","mapreduce.reduce.memory.mb":"1024","s3native.client-write-packet-size":"65536","yarn.timeline-service.hostname":"0.0.0.0","file.replication":"1","yarn.nodemanager.container-metrics.unregister-delay-ms":"10000","yarn.nodemanager.container-metrics.period-ms":"-1","yarn.nodemanager.log.retain-seconds":"10800","yarn.timeline-service.entity-group-fs-store.cleaner-interval-seconds":"3600","yarn.resourcemanager.keytab":"/etc/krb5.keytab","hadoop.security.group.mapping.providers.combined":"true","mapreduce.reduce.merge.inmem.threshold":"1000","yarn.timeline-service.recovery.enabled":"false","yarn.sharedcache.nm.uploader.thread-count":"20","mapreduce.shuffle.ssl.enabled":"false","yarn.resourcemanager.state-store.max-completed-applications":"${yarn.resourcemanager.max-completed-applications}","mapreduce.job.speculative.minimum-allowed-tasks":"10","yarn.log-aggregation.retain-seconds":"-1","yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb":"0","mapreduce.jobhistory.max-age-ms":"604800000","hadoop.http.cross-origin.allowed-methods":"GET,POST,HEAD","mapreduce.jobhistory.webapp.address":"0.0.0.0:19888","mapreduce.jobtracker.system.dir":"${hadoop.tmp.dir}/mapred/system","yarn.client.nodemanager-connect.max-wait-ms":"180000","yarn.resourcemanager.webapp.address":"${yarn.resourcemanager.hostname}:8088","mapreduce.jobhistory.recovery.enable":"false","mapreduce.reduce.shuffle.parallelcopies":"5","fs.AbstractFileSystem.webhdfs.impl":"org.apache.hadoop.fs.WebHdfs","fs.trash.interval":"0","yarn.app.mapreduce.client.max-retries":"3","hadoop.security.authentication":"simple","mapreduce.task.profile.reduce.params":"${mapreduce.task.profile.params}","yarn.app.mapreduce.am.resource.mb":"1536","mapreduce.input.fileinputformat.list-status.num-threads":"1","yarn.nodemanager.container-executor.class":"org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor","io.mapfile.bloom.size":"1048576","yarn.timeline-service.ttl-ms":"604800000","yarn.nodemanager.resource.cpu-vcores":"-1","mapreduce.job.reduces":"1","fs.s3a.multipart.size":"100M","yarn.scheduler.minimum-allocation-vcores":"1","mapreduce.job.speculative.speculative-cap-total-tasks":"0.01","hadoop.ssl.client.conf":"ssl-client.xml","mapreduce.job.queuename":"default","ha.health-monitor.sleep-after-disconnect.ms":"1000","s3.bytes-per-checksum":"512","yarn.app.mapreduce.shuffle.log.limit.kb":"0","hadoop.security.group.mapping":"org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback","yarn.client.application-client-protocol.poll-timeout-ms":"-1","mapreduce.jobhistory.jhist.format":"json","yarn.resourcemanager.ha.enabled":"false","hadoop.http.staticuser.user":"dr.who","mapreduce.task.exit.timeout.check-interval-ms":"20000","mapreduce.task.exit.timeout":"60000","yarn.nodemanager.linux-container-executor.resources-handler.class":"org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler","mapreduce.r
educe.shuffle.memory.limit.percent":"0.25","yarn.resourcemanager.reservation-system.enable":"false","s3.client-write-packet-size":"65536","mapreduce.map.output.compress":"false","ha.zookeeper.acl":"world:anyone:rwcda","ipc.server.max.connections":"0","yarn.scheduler.maximum-allocation-mb":"8192","yarn.resourcemanager.scheduler.monitor.policies":"org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy","yarn.sharedcache.cleaner.period-mins":"1440","yarn.app.mapreduce.am.container.log.limit.kb":"0","s3native.blocksize":"67108864","ipc.client.connect.retry.interval":"1000","yarn.resourcemanager.zk-state-store.parent-path":"/rmstore","mapreduce.jobhistory.cleaner.enable":"true","yarn.timeline-service.client.fd-flush-interval-secs":"10","hadoop.security.kms.client.encrypted.key.cache.expiry":"43200000","yarn.client.nodemanager-client-async.thread-pool-max-size":"500","mapreduce.map.maxattempts":"4","yarn.nodemanager.sleep-delay-before-sigkill.ms":"250","mapreduce.job.end-notification.retry.attempts":"0","yarn.nodemanager.resource.count-logical-processors-as-cores":"false","yarn.resourcemanager.zk-num-retries":"1000","hadoop.registry.zk.root":"/registry","adl.feature.ownerandgroup.enableupn":"false","mapreduce.job.reduce.shuffle.consumer.plugin.class":"org.apache.hadoop.mapreduce.task.reduce.Shuffle","yarn.resourcemanager.delayed.delegation-token.removal-interval-ms":"*********(redacted)","yarn.nodemanager.localizer.cache.target-size-mb":"10240","ftp.client-write-packet-size":"65536","fs.AbstractFileSystem.adl.impl":"org.apache.hadoop.fs.adl.Adl","yarn.client.failover-retries":"0","fs.s3a.multipart.purge.age":"86400","io.native.lib.available":"true","net.topology.node.switch.mapping.impl":"org.apache.hadoop.net.ScriptBasedMapping","yarn.nodemanager.amrmproxy.address":"0.0.0.0:8048","ipc.server.listen.queue.size":"128","map.sort.class":"org.apache.hadoop.util.QuickSort","fs.viewfs.rename.strategy":"SAME_MOUNTPOINT","hadoop.security.kms.client.authentication.retry-count":"1","fs.permissions.umask-mode":"022","yarn.nodemanager.vmem-check-enabled":"true","yarn.nodemanager.recovery.compaction-interval-secs":"3600","yarn.app.mapreduce.client-am.ipc.max-retries":"3","mapreduce.job.ubertask.maxreduces":"1","hadoop.security.kms.client.encrypted.key.cache.size":"500","hadoop.security.java.secure.random.algorithm":"SHA1PRNG","ha.failover-controller.cli-check.rpc-timeout.ms":"20000","mapreduce.jobhistory.jobname.limit":"50","yarn.client.nodemanager-connect.retry-interval-ms":"10000","yarn.timeline-service.state-store-class":"org.apache.hadoop.yarn.server.timeline.recovery.LeveldbTimelineStateStore","yarn.nodemanager.env-whitelist":"JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME","yarn.sharedcache.nested-level":"3","yarn.nodemanager.webapp.address":"${yarn.nodemanager.hostname}:8042","rpc.metrics.quantile.enable":"false","mapreduce.jobhistory.admin.acl":"*","yarn.resourcemanager.system-metrics-publisher.dispatcher.pool-size":"10","hadoop.http.authentication.kerberos.keytab":"${user.home}/hadoop.keytab","yarn.resourcemanager.recovery.enabled":"false"},"System Properties":{"java.io.tmpdir":"/tmp","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Oracle 
Corporation","java.vm.specification.version":"1.8","user.home":"/root","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","sun.arch.data.model":"64","sun.boot.library.path":"/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/amd64","user.dir":"/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8","java.library.path":"/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib","sun.cpu.isalist":"","os.arch":"amd64","java.vm.version":"25.252-b09","jetty.git.hash":"ab228fde9e55e9164c738d7fa121f8ac5acd51c9","java.endorsed.dirs":"/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/endorsed","java.runtime.version":"1.8.0_252-b09","java.vm.info":"mixed mode","java.ext.dirs":"/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/ext:/usr/java/packages/lib/ext","java.runtime.name":"OpenJDK Runtime Environment","file.separator":"/","java.class.version":"52.0","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/resources.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/rt.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/sunrsasign.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/jsse.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/jce.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/charsets.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/lib/jfr.jar:/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre/classes","file.encoding":"UTF-8","user.timezone":"Asia/Shanghai","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"3.10.0-1127.10.1.el7.x86_64","sun.os.patch.level":"unknown","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","user.language":"en","java.vendor.url":"http://java.oracle.com/","java.awt.printerjob":"sun.print.PSPrinterJob","java.awt.graphicsenv":"sun.awt.X11GraphicsEnvironment","awt.toolkit":"sun.awt.X11.XToolkit","os.name":"Linux","java.vm.vendor":"Oracle Corporation","java.vendor.url.bug":"http://bugreport.sun.com/bugreport/","user.name":"root","java.vm.name":"OpenJDK 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit --master local[*] --conf spark.eventLog.dir=/tmp/spark-history --conf spark.eventLog.enabled=true --conf spark.sql.shuffle.partitions=2 --class org.apache.spark.examples.sql.streaming.StructuredKafkaWordCount ./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar 192.168.130.97:9092 subscribe test5","java.home":"/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.252.b09-2.el7_8.x86_64/jre","java.version":"1.8.0_252","sun.io.unicode.encoding":"UnicodeLittle"},"Classpath Entries":{"/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/metrics-graphite-4.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/nimbus-jose-jwt-4.41.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-vector-code-gen-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-jaxrs-1.9.13.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-server-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/pyrolite-4.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/conf/":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/json-smart-2.3.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/objenesis-2.5.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-auth-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jsp-api-2.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-unsafe_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-codec-1.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/protobuf-java-2.5.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/avro-1.8.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/guice-3.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/aopalliance-repackaged-2.6.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/transaction-api-1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spire_2.12-0.17.0-M1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/gson-2.2.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/datanucleus-rdbms-4.1.19.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-module-paranamer-2.10.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/libfb303-0.9.3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-cli-1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-tags_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/scala-library-2.12.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/xbean-asm7-shaded-4.15.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-container-servlet-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hk2-api-2.6.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jakarta.xml.bind-api-2.3.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/okhttp-2.4.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/derby-10.12.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-collections-3.2.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/httpcore-4.4.12.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-beanutils-1.9.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spire-util_2.12-0.17.0-M1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-crypto-1.0.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-launcher_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/stax-api-1.0-2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/json4s-ast_2.12-3.6.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/lz4-java-1.7.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/scala-parser-combinators_2.12-1.1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/parquet-format-2.4.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/parquet-column-1.10.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-logging-1.1.3.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/audience-annotations-0.5.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-jdbc-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-hive-thriftserver_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-cli-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/javolution-5.5.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/JLargeArrays-1.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-yarn-api-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/algebra_2.12-2.0.0-M2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-dbcp-1.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jakarta.ws.rs-api-2.1.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/threeten-extra-1.5.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-io-2.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/metrics-json-4.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/libthrift-0.12.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/compress-lzf-1.0.3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/metrics-jmx-4.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jakarta.inject-2.6.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/stax-api-1.0.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-shims-common-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/curator-recipes-2.7.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/antlr4-runtime-4.7.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/slf4j-api-1.7.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/oro-2.0.8.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/arrow-memory-0.15.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jpam-1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/velocity-1.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/orc-core-1.5.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-sql_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-databind-2.10.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-text-1.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-client-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/htrace-core4-4.0.1-incubating.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/json-1.8.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-graphx_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/avro-ipc-1.8.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/macro-compat_2.12-1.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jetty-util-6.1.26.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/core-1.1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-shims-2.3.7.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/joda-time-2.10.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/parquet-encoding-1.10.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-llap-common-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-network-common_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/datanucleus-api-jdo-4.2.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/paranamer-2.8.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-shims-0.23-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/activation-1.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/curator-framework-2.7.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-compress-1.8.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-mapreduce-client-common-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/RoaringBitmap-0.7.45.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/ivy-2.4.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-core-2.10.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-yarn-client-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-httpclient-3.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-yarn_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/parquet-common-1.10.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/zstd-jni-1.4.5-2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-container-servlet-core-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/snappy-java-1.1.7.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/shapeless_2.12-2.3.3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-pool-1.5.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/json4s-core_2.12-3.6.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/httpclient-4.5.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/api-util-1.0.0-M20.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/aircompressor-0.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-repl_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-yarn-common-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/leveldbjni-all-1.8.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-hk2-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jta-1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jetty-sslengine-6.1.26.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-net-3.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/datanucleus-core-4.1.17.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-yarn-server-web-proxy-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/breeze_2.12-1.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/orc-mapreduce-1.5.10.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jetty-6.1.26.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-core_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/xz-1.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/javax.inject-1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/scala-compiler-2.12.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/metrics-jvm-4.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/orc-shims-1.5.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jaxb-api-2.2.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jakarta.validation-api-2.0.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spire-macros_2.12-0.17.0-M1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/janino-3.1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/osgi-resource-locator-1.0.3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jcl-over-slf4j-1.7.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-mapreduce-client-app-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hk2-utils-2.6.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-sketch_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/JTransforms-3.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/kafka-clients-2.4.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/guice-servlet-3.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/okio-1.4.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-annotations-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-math3-3.4.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/json4s-scalap_2.12-3.6.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/bonecp-0.8.0.RELEASE.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-streaming_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/accessors-smart-1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/guava-14.0.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/shims-0.7.45.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/ST4-4.0.4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-module-scala_2.12-2.10.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/scala-xml_2.12-1.2.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/chill-java-0.9.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-mapreduce-client-shuffle-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/cats-kernel_2.12-2.0.0-M4.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/stream-2.9.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-configuration-1.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jodd-core-3.5.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/scala-collection-compat_2.12-2.1.1.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-pool2-2.6.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jul-to-slf4j-1.7.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/xmlenc-0.52.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/flatbuffers-java-1.9.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-token-provider-kafka-0-10_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/json4s-jackson_2.12-3.6.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-compiler-3.1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jline-2.14.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/breeze-macros_2.12-1.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/kryo-shaded-4.0.2.jar":"System Classpath","spark://iZbp19vpr16ix621sdw476Z:46309/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar":"Added By User","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-common-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-hive_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-common-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/istack-commons-runtime-3.0.8.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/curator-client-2.7.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-xc-1.9.13.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/zookeeper-3.4.14.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/parquet-hadoop-1.10.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jakarta.annotation-api-1.3.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-shims-scheduler-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/univocity-parsers-2.8.3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-digester-1.8.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-mllib_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/arpack_combined_all-0.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-sql-kafka-0-10_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-annotations-2.10.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hk2-locator-2.6.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-mapreduce-client-core-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/avro-mapred-1.8.2-hadoop2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-yarn-server-common-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/scala-reflect-2.12.10.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/super-csv-2.2.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-mapreduce-client-jobclient-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-client-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-common-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/apacheds-kerberos-codec-2.0.0-M15.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-exec-2.3.7-core.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/opencsv-2.3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/api-asn1-api-1.0.0-M20.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-storage-api-2.7.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spire-platform_2.12-0.17.0-M1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/aopalliance-1.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/HikariCP-2.5.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-metastore-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/minlog-1.3.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/arrow-format-0.15.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jsr305-3.0.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-lang-2.6.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/commons-lang3-3.9.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/javax.jdo-3.2.0-m3.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/apacheds-i18n-2.0.0-M15.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/javassist-3.25.0-GA.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/antlr-runtime-3.5.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/log4j-1.2.17.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-beeline-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/chill_2.12-0.9.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jdo-api-3.0.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-kvstore_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/metrics-core-4.1.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jaxb-runtime-2.3.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-mllib-local_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/py4j-0.10.9.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hive-serde-2.3.7.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/hadoop-hdfs-client-2.8.5.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-network-shuffle_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jcip-annotations-1.0-1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/jersey-media-jaxb-2.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/parquet-jackson-1.10.1.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/slf4j-log4j12-1.7.30.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/netty-all-4.1.47.Final.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/arrow-vector-0.15.1.jar":"System 
Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/spark-catalyst_2.12-3.1.0-SNAPSHOT.jar":"System Classpath","/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/jars/machinist_2.12-0.6.8.jar":"System Classpath"}} +{"Event":"SparkListenerApplicationStart","App Name":"StructuredKafkaWordCount","App ID":"local-1596020211915","Timestamp":1596020210919,"User":"root"} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent","id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:56:55.947Z"} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":0,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 0","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48276}}, {\"test5\":{\"0\":48279}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, 
input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#142]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = f7faa1e9-69d9-41b4-9d77-919795af2413, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = f7faa1e9-69d9-41b4-9d77-919795af2413, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27fafcca\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27fafcca","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 0, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 0, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#66]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS 
value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":80,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":79,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":76,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":77,"metricType":"timing"},{"name":"peak memory","accumulatorId":75,"metricType":"size"},{"name":"number of output rows","accumulatorId":74,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":78,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":71,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":72,"metricType":"timing"},{"name":"peak memory","accumulatorId":70,"metricType":"size"},{"name":"number of output rows","accumulatorId":69,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":73,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":68,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":20,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":21,"metricType":"nsTiming"},{"name":"records read","accumulatorId":18,"metricType":"sum"},{"name":"local bytes read","accumulatorId":16,"metricType":"size"},{"name":"fetch wait time","accumulatorId":17,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":14,"metricType":"size"},{"name":"local blocks read","accumulatorId":13,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":12,"metricType":"sum"},{"name":"data size","accumulatorId":11,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":15,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":19,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":67,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":64,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":65,"metricType":"timing"},{"name":"peak memory","accumulatorId":63,"metricType":"size"},{"name":"number of output rows","accumulatorId":62,"metricType":"sum"},{"name":"avg hash probe bucket list 
iters","accumulatorId":66,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":61,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":51,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":52,"metricType":"sum"},{"name":"memory used by state","accumulatorId":57,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":59,"metricType":"sum"},{"name":"number of output rows","accumulatorId":50,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":58,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":60,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":56,"metricType":"timing"},{"name":"time to remove","accumulatorId":55,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":53,"metricType":"sum"},{"name":"time to update","accumulatorId":54,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":47,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":48,"metricType":"timing"},{"name":"peak memory","accumulatorId":46,"metricType":"size"},{"name":"number of output rows","accumulatorId":45,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":49,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":44,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020220179} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":1,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 0","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, 
timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48276}}, {\"test5\":{\"0\":48279}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#218]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 64a4779b-846a-4f20-9f5c-899a8dbf68d8, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 64a4779b-846a-4f20-9f5c-899a8dbf68d8, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27fafcca\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27fafcca","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 0, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], 
functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 0, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#66]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":80,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":79,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":76,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":77,"metricType":"timing"},{"name":"peak memory","accumulatorId":75,"metricType":"size"},{"name":"number of output rows","accumulatorId":74,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":78,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":71,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":72,"metricType":"timing"},{"name":"peak memory","accumulatorId":70,"metricType":"size"},{"name":"number of output rows","accumulatorId":69,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":73,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":68,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":20,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":21,"metricType":"nsTiming"},{"name":"records read","accumulatorId":18,"metricType":"sum"},{"name":"local bytes 
read","accumulatorId":16,"metricType":"size"},{"name":"fetch wait time","accumulatorId":17,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":14,"metricType":"size"},{"name":"local blocks read","accumulatorId":13,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":12,"metricType":"sum"},{"name":"data size","accumulatorId":11,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":15,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":19,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":67,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":64,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":65,"metricType":"timing"},{"name":"peak memory","accumulatorId":63,"metricType":"size"},{"name":"number of output rows","accumulatorId":62,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":66,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":61,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":51,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":52,"metricType":"sum"},{"name":"memory used by state","accumulatorId":57,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":59,"metricType":"sum"},{"name":"number of output rows","accumulatorId":50,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":58,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":60,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":56,"metricType":"timing"},{"name":"time to remove","accumulatorId":55,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":53,"metricType":"sum"},{"name":"time to update","accumulatorId":54,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":47,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":48,"metricType":"timing"},{"name":"peak memory","accumulatorId":46,"metricType":"size"},{"name":"number of output rows","accumulatorId":45,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":49,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":44,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020220258} +{"Event":"SparkListenerJobStart","Job ID":0,"Submission Time":1596020221633,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":6,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[5],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory 
Size":0,"Disk Size":0},{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"16\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[1],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"9\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"14\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"DataSourceRDD","Scope":"{\"id\":\"20\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":11,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk 
Size":0},{"RDD ID":7,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"8\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"4\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":10,"Name":"StateStoreRDD","Scope":"{\"id\":\"3\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"StateStoreRDD","Scope":"{\"id\":\"7\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[0],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage 
IDs":[0,1],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 0","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"0","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"1","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":6,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[5],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD 
ID":2,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"16\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[1],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"9\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"14\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"DataSourceRDD","Scope":"{\"id\":\"20\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020221656,"Accumulables":[],"Resource Profile 
Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 0","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"0","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"1","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1596020221738,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1596020221738,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020222649,"Failed":false,"Killed":false,"Accumulables":[{"ID":21,"Name":"shuffle write time","Update":"9599308","Value":"9599308","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":20,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":19,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":11,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":68,"Name":"duration","Update":"296","Value":"296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":69,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":70,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":72,"Name":"time in aggregation build","Update":"200","Value":"200","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":74,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":75,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":77,"Name":"time in aggregation build","Update":"190","Value":"190","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":79,"Name":"duration","Update":"336","Value":"336","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":80,"Name":"number of output rows","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":125,"Name":"internal.metrics.input.recordsRead","Update":3,"Value":3,"Internal":true,"Count Failed Values":true},{"ID":123,"Name":"internal.metrics.shuffle.write.writeTime","Update":9599308,"Value":9599308,"Internal":true,"Count Failed Values":true},{"ID":122,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":121,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":112,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":109,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":108,"Name":"internal.metrics.jvmGCTime","Update":17,"Value":17,"Internal":true,"Count Failed Values":true},{"ID":107,"Name":"internal.metrics.resultSize","Update":2630,"Value":2630,"Internal":true,"Count Failed Values":true},{"ID":106,"Name":"internal.metrics.executorCpuTime","Update":466139164,"Value":466139164,"Internal":true,"Count Failed Values":true},{"ID":105,"Name":"internal.metrics.executorRunTime","Update":503,"Value":503,"Internal":true,"Count Failed Values":true},{"ID":104,"Name":"internal.metrics.executorDeserializeCpuTime","Update":301869581,"Value":301869581,"Internal":true,"Count Failed Values":true},{"ID":103,"Name":"internal.metrics.executorDeserializeTime","Update":361,"Value":361,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":361,"Executor Deserialize CPU Time":301869581,"Executor Run Time":503,"Executor CPU Time":466139164,"Peak Execution Memory":524288,"Result Size":2630,"JVM GC Time":17,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks 
Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":9599308,"Shuffle Records Written":1},"Input Metrics":{"Bytes Read":0,"Records Read":3},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":6,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[5],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"16\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[1],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"9\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"14\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"DataSourceRDD","Scope":"{\"id\":\"20\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020221656,"Completion Time":1596020222661,"Accumulables":[{"ID":104,"Name":"internal.metrics.executorDeserializeCpuTime","Value":301869581,"Internal":true,"Count Failed Values":true},{"ID":122,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":77,"Name":"time in aggregation build","Value":"190","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":68,"Name":"duration","Value":"296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":80,"Name":"number of output rows","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":125,"Name":"internal.metrics.input.recordsRead","Value":3,"Internal":true,"Count Failed Values":true},{"ID":107,"Name":"internal.metrics.resultSize","Value":2630,"Internal":true,"Count Failed Values":true},{"ID":74,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":11,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":20,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":106,"Name":"internal.metrics.executorCpuTime","Value":466139164,"Internal":true,"Count Failed Values":true},{"ID":109,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":121,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true},{"ID":112,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":103,"Name":"internal.metrics.executorDeserializeTime","Value":361,"Internal":true,"Count Failed Values":true},{"ID":79,"Name":"duration","Value":"336","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":70,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":19,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":123,"Name":"internal.metrics.shuffle.write.writeTime","Value":9599308,"Internal":true,"Count Failed Values":true},{"ID":105,"Name":"internal.metrics.executorRunTime","Value":503,"Internal":true,"Count Failed Values":true},{"ID":69,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":72,"Name":"time in aggregation build","Value":"200","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":108,"Name":"internal.metrics.jvmGCTime","Value":17,"Internal":true,"Count Failed Values":true},{"ID":21,"Name":"shuffle write 
time","Value":"9599308","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":75,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":11,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"8\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"4\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":10,"Name":"StateStoreRDD","Scope":"{\"id\":\"3\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"StateStoreRDD","Scope":"{\"id\":\"7\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[0],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020222688,"Accumulables":[],"Resource Profile 
Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 0","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"0","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"1","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1596020222709,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"NODE_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":0,"Attempt":0,"Launch Time":1596020222713,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":0,"Attempt":0,"Launch Time":1596020222713,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020222954,"Failed":false,"Killed":false,"Accumulables":[{"ID":44,"Name":"duration","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":46,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":48,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":54,"Name":"time to update","Update":"14","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":55,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":56,"Name":"time to commit changes","Update":"50","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":58,"Name":"estimated size of state only on current version","Update":"64","Value":"64","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":57,"Name":"memory used by state","Update":"208","Value":"208","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":61,"Name":"duration","Update":"14","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":63,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":65,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":145,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":144,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":143,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":142,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":141,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":140,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":139,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":137,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":134,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":132,"Name":"internal.metrics.resultSize","Update":5354,"Value":5354,"Internal":true,"Count Failed Values":true},{"ID":131,"Name":"internal.metrics.executorCpuTime","Update":93367533,"Value":93367533,"Internal":true,"Count Failed Values":true},{"ID":130,"Name":"internal.metrics.executorRunTime","Update":203,"Value":203,"Internal":true,"Count Failed Values":true},{"ID":129,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10308753,"Value":10308753,"Internal":true,"Count Failed Values":true},{"ID":128,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":23,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor 
Deserialize CPU Time":10308753,"Executor Run Time":203,"Executor CPU Time":93367533,"Peak Execution Memory":524288,"Result Size":5354,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":1,"Attempt":0,"Launch Time":1596020222709,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"NODE_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020222965,"Failed":false,"Killed":false,"Accumulables":[{"ID":44,"Name":"duration","Update":"33","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":49,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":45,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":46,"Name":"peak memory","Update":"4456448","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":48,"Name":"time in aggregation build","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":54,"Name":"time to update","Update":"28","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":53,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":55,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":56,"Name":"time to commit changes","Update":"31","Value":"81","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":58,"Name":"estimated size of state only on current version","Update":"424","Value":"488","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":50,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":57,"Name":"memory used by state","Update":"568","Value":"776","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":52,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":61,"Name":"duration","Update":"28","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":62,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":63,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":65,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":67,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":13,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":17,"Name":"fetch wait 
time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":16,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":18,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":145,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":144,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":143,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":142,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":141,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":140,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":139,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":137,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":132,"Name":"internal.metrics.resultSize","Update":5574,"Value":10928,"Internal":true,"Count Failed Values":true},{"ID":131,"Name":"internal.metrics.executorCpuTime","Update":91355172,"Value":184722705,"Internal":true,"Count Failed Values":true},{"ID":130,"Name":"internal.metrics.executorRunTime","Update":205,"Value":408,"Internal":true,"Count Failed Values":true},{"ID":129,"Name":"internal.metrics.executorDeserializeCpuTime","Update":21029530,"Value":31338283,"Internal":true,"Count Failed Values":true},{"ID":128,"Name":"internal.metrics.executorDeserializeTime","Update":34,"Value":57,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":34,"Executor Deserialize CPU Time":21029530,"Executor Run Time":205,"Executor CPU Time":91355172,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":11,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"0\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at 
StructuredKafkaWordCount.scala:86","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"8\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"4\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":10,"Name":"StateStoreRDD","Scope":"{\"id\":\"3\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"StateStoreRDD","Scope":"{\"id\":\"7\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[0],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020222688,"Completion Time":1596020222967,"Accumulables":[{"ID":137,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":128,"Name":"internal.metrics.executorDeserializeTime","Value":57,"Internal":true,"Count Failed Values":true},{"ID":131,"Name":"internal.metrics.executorCpuTime","Value":184722705,"Internal":true,"Count Failed Values":true},{"ID":50,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":140,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed 
Values":true},{"ID":53,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":62,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":17,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":134,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":44,"Name":"duration","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":56,"Name":"time to commit changes","Value":"81","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":65,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":142,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":46,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":145,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":55,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":49,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":67,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":139,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":58,"Name":"estimated size of state only on current version","Value":"488","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":13,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":130,"Name":"internal.metrics.executorRunTime","Value":408,"Internal":true,"Count Failed Values":true},{"ID":16,"Name":"local bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":52,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":61,"Name":"duration","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":129,"Name":"internal.metrics.executorDeserializeCpuTime","Value":31338283,"Internal":true,"Count Failed Values":true},{"ID":132,"Name":"internal.metrics.resultSize","Value":10928,"Internal":true,"Count Failed Values":true},{"ID":141,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":45,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":63,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":54,"Name":"time to update","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":144,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":18,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":57,"Name":"memory used by state","Value":"776","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":48,"Name":"time in aggregation build","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":143,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed 
Values":true}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":0,"Completion Time":1596020222973,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":2,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 0","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: [value#46, count#47]\nArguments: [value#46, count#47]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#46, count#47]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":153,"metricType":"sum"}]},"time":1596020223028} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":2,"time":1596020223062} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":1,"time":1596020223069} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":0,"time":1596020223069} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:56:56.015Z","batchId":0,"batchDuration":7110,"durationMs":{"triggerExecution":7109,"queryPlanning":439,"getBatch":21,"latestOffset":3524,"addBatch":3011,"walCommit":35},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":776,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":488,"loadedMapCacheHitCount":0,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":null,"endOffset":"{\"test5\":{\"0\":48279}}","numInputRows":3,"inputRowsPerSecond":"NaN","processedRowsPerSecond":0.42194092827004215}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":3,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 
1","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48279}}, {\"test5\":{\"0\":48642}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#373]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 1fb6b6c6-ced8-4f85-80af-1f3f4c424457, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id 
: 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 1fb6b6c6-ced8-4f85-80af-1f3f4c424457, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3a1eb73c\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3a1eb73c","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 1, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 1, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#297]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] 
class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":237,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":236,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":233,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":234,"metricType":"timing"},{"name":"peak memory","accumulatorId":232,"metricType":"size"},{"name":"number of output rows","accumulatorId":231,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":235,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":228,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":229,"metricType":"timing"},{"name":"peak memory","accumulatorId":227,"metricType":"size"},{"name":"number of output rows","accumulatorId":226,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":230,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":225,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":177,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":178,"metricType":"nsTiming"},{"name":"records read","accumulatorId":175,"metricType":"sum"},{"name":"local bytes read","accumulatorId":173,"metricType":"size"},{"name":"fetch wait time","accumulatorId":174,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":171,"metricType":"size"},{"name":"local blocks read","accumulatorId":170,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":169,"metricType":"sum"},{"name":"data size","accumulatorId":168,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":172,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":176,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":224,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":221,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":222,"metricType":"timing"},{"name":"peak memory","accumulatorId":220,"metricType":"size"},{"name":"number of output rows","accumulatorId":219,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":223,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":218,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":208,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":209,"metricType":"sum"},{"name":"memory used by state","accumulatorId":214,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":216,"metricType":"sum"},{"name":"number of output rows","accumulatorId":207,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":215,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":217,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":213,"metricType":"timing"},{"name":"time to 
remove","accumulatorId":212,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":210,"metricType":"sum"},{"name":"time to update","accumulatorId":211,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":204,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":205,"metricType":"timing"},{"name":"peak memory","accumulatorId":203,"metricType":"size"},{"name":"number of output rows","accumulatorId":202,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":206,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":201,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020223333} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":4,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 1","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48279}}, {\"test5\":{\"0\":48642}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: 
java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#449]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 7992c0a8-0641-440d-aaf7-ad453fe25c0a, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 7992c0a8-0641-440d-aaf7-ad453fe25c0a, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3a1eb73c\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@3a1eb73c","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 1, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 1, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#297]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, 
StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":237,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":236,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":233,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":234,"metricType":"timing"},{"name":"peak memory","accumulatorId":232,"metricType":"size"},{"name":"number of output rows","accumulatorId":231,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":235,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":228,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":229,"metricType":"timing"},{"name":"peak memory","accumulatorId":227,"metricType":"size"},{"name":"number of output rows","accumulatorId":226,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":230,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":225,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":177,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":178,"metricType":"nsTiming"},{"name":"records read","accumulatorId":175,"metricType":"sum"},{"name":"local bytes read","accumulatorId":173,"metricType":"size"},{"name":"fetch wait time","accumulatorId":174,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":171,"metricType":"size"},{"name":"local blocks read","accumulatorId":170,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":169,"metricType":"sum"},{"name":"data size","accumulatorId":168,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":172,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":176,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":224,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":221,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":222,"metricType":"timing"},{"name":"peak memory","accumulatorId":220,"metricType":"size"},{"name":"number of output 
rows","accumulatorId":219,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":223,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":218,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":208,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":209,"metricType":"sum"},{"name":"memory used by state","accumulatorId":214,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":216,"metricType":"sum"},{"name":"number of output rows","accumulatorId":207,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":215,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":217,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":213,"metricType":"timing"},{"name":"time to remove","accumulatorId":212,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":210,"metricType":"sum"},{"name":"time to update","accumulatorId":211,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":204,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":205,"metricType":"timing"},{"name":"peak memory","accumulatorId":203,"metricType":"size"},{"name":"number of output rows","accumulatorId":202,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":206,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":201,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020223382} +{"Event":"SparkListenerJobStart","Job ID":1,"Submission Time":1596020223482,"Stage Infos":[{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"41\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"DataSourceRDD","Scope":"{\"id\":\"53\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"53\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"48\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"49\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at 
StructuredKafkaWordCount.scala:86","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"47\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"42\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[22],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":22,"Name":"StateStoreRDD","Scope":"{\"id\":\"36\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[21],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"41\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":21,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"37\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at 
StructuredKafkaWordCount.scala:86","Parent IDs":[20],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":20,"Name":"StateStoreRDD","Scope":"{\"id\":\"40\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[2,3],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 1","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"1","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"4","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"41\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"DataSourceRDD","Scope":"{\"id\":\"53\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"53\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"48\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"49\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"47\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[15],"Storage 
Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"42\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020223485,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at 
StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 1","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"1","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"4","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":2,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":0,"Attempt":0,"Launch Time":1596020223493,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":2,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":0,"Attempt":0,"Launch Time":1596020223493,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020223601,"Failed":false,"Killed":false,"Accumulables":[{"ID":178,"Name":"shuffle write time","Update":"837580","Value":"837580","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":177,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":176,"Name":"shuffle bytes written","Update":"169","Value":"169","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":168,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":225,"Name":"duration","Update":"84","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":226,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":227,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":229,"Name":"time in aggregation build","Update":"74","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":231,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":232,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":234,"Name":"time in aggregation build","Update":"68","Value":"68","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":236,"Name":"duration","Update":"84","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":237,"Name":"number of output rows","Update":"363","Value":"363","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":282,"Name":"internal.metrics.input.recordsRead","Update":363,"Value":363,"Internal":true,"Count Failed Values":true},{"ID":280,"Name":"internal.metrics.shuffle.write.writeTime","Update":837580,"Value":837580,"Internal":true,"Count Failed Values":true},{"ID":279,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":278,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":169,"Value":169,"Internal":true,"Count Failed 
Values":true},{"ID":269,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":264,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":263,"Name":"internal.metrics.executorCpuTime","Update":95945587,"Value":95945587,"Internal":true,"Count Failed Values":true},{"ID":262,"Name":"internal.metrics.executorRunTime","Update":96,"Value":96,"Internal":true,"Count Failed Values":true},{"ID":261,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7437557,"Value":7437557,"Internal":true,"Count Failed Values":true},{"ID":260,"Name":"internal.metrics.executorDeserializeTime","Update":7,"Value":7,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":7,"Executor Deserialize CPU Time":7437557,"Executor Run Time":96,"Executor CPU Time":95945587,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":169,"Shuffle Write Time":837580,"Shuffle Records Written":1},"Input Metrics":{"Bytes Read":0,"Records Read":363},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"41\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"DataSourceRDD","Scope":"{\"id\":\"53\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"53\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"48\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of 
Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"49\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"47\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"42\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020223485,"Completion Time":1596020223603,"Accumulables":[{"ID":227,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":236,"Name":"duration","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":176,"Name":"shuffle bytes written","Value":"169","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":262,"Name":"internal.metrics.executorRunTime","Value":96,"Internal":true,"Count Failed Values":true},{"ID":226,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":280,"Name":"internal.metrics.shuffle.write.writeTime","Value":837580,"Internal":true,"Count Failed Values":true},{"ID":229,"Name":"time in aggregation build","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":232,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":178,"Name":"shuffle write time","Value":"837580","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":225,"Name":"duration","Value":"84","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":261,"Name":"internal.metrics.executorDeserializeCpuTime","Value":7437557,"Internal":true,"Count Failed Values":true},{"ID":279,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":234,"Name":"time in aggregation build","Value":"68","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":264,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":282,"Name":"internal.metrics.input.recordsRead","Value":363,"Internal":true,"Count Failed Values":true},{"ID":237,"Name":"number of output rows","Value":"363","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":177,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":168,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":231,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":263,"Name":"internal.metrics.executorCpuTime","Value":95945587,"Internal":true,"Count Failed Values":true},{"ID":260,"Name":"internal.metrics.executorDeserializeTime","Value":7,"Internal":true,"Count Failed Values":true},{"ID":269,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":278,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":169,"Internal":true,"Count Failed Values":true}],"Resource Profile Id":0}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[22],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":22,"Name":"StateStoreRDD","Scope":"{\"id\":\"36\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[21],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"41\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":21,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"37\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[20],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":20,"Name":"StateStoreRDD","Scope":"{\"id\":\"40\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[2],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020223613,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 1","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"1","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"4","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} 
+{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":0,"Attempt":0,"Launch Time":1596020223625,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":1,"Attempt":0,"Launch Time":1596020223626,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":0,"Attempt":0,"Launch Time":1596020223625,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020223717,"Failed":false,"Killed":false,"Accumulables":[{"ID":201,"Name":"duration","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":203,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":205,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":211,"Name":"time to update","Update":"6","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":212,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":213,"Name":"time to commit changes","Update":"38","Value":"38","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":215,"Name":"estimated size of state only on current version","Update":"88","Value":"88","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":216,"Name":"count of cache hit on states cache in provider","Update":"2","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":214,"Name":"memory used by state","Update":"376","Value":"376","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":218,"Name":"duration","Update":"6","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":220,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":222,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":302,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":300,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":297,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":296,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed 
Values":true},{"ID":294,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":289,"Name":"internal.metrics.resultSize","Update":5311,"Value":5311,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorCpuTime","Update":22954307,"Value":22954307,"Internal":true,"Count Failed Values":true},{"ID":287,"Name":"internal.metrics.executorRunTime","Update":77,"Value":77,"Internal":true,"Count Failed Values":true},{"ID":286,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6627382,"Value":6627382,"Internal":true,"Count Failed Values":true},{"ID":285,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":6,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":6627382,"Executor Run Time":77,"Executor CPU Time":22954307,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":1,"Attempt":0,"Launch Time":1596020223626,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020223720,"Failed":false,"Killed":false,"Accumulables":[{"ID":201,"Name":"duration","Update":"4","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":206,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":202,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":203,"Name":"peak memory","Update":"4456448","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":205,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":211,"Name":"time to update","Update":"18","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":210,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":212,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":213,"Name":"time to commit changes","Update":"30","Value":"68","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":215,"Name":"estimated size of state only on current version","Update":"368","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":207,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":216,"Name":"count of cache hit on states cache in provider","Update":"2","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":214,"Name":"memory used by state","Update":"840","Value":"1216","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":209,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":218,"Name":"duration","Update":"19","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":219,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":220,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":222,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":224,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":170,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":174,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":173,"Name":"local bytes read","Update":"169","Value":"169","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":175,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":302,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":300,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":169,"Value":169,"Internal":true,"Count Failed Values":true},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":297,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":296,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":294,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":289,"Name":"internal.metrics.resultSize","Update":5574,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorCpuTime","Update":25907369,"Value":48861676,"Internal":true,"Count Failed Values":true},{"ID":287,"Name":"internal.metrics.executorRunTime","Update":82,"Value":159,"Internal":true,"Count Failed Values":true},{"ID":286,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7573630,"Value":14201012,"Internal":true,"Count Failed Values":true},{"ID":285,"Name":"internal.metrics.executorDeserializeTime","Update":7,"Value":13,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":7,"Executor Deserialize CPU Time":7573630,"Executor Run Time":82,"Executor CPU Time":25907369,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":169,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[22],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":22,"Name":"StateStoreRDD","Scope":"{\"id\":\"36\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[21],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"41\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":21,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"37\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[20],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":20,"Name":"StateStoreRDD","Scope":"{\"id\":\"40\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020223613,"Completion Time":1596020223724,"Accumulables":[{"ID":218,"Name":"duration","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":209,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":173,"Name":"local bytes read","Value":"169","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":286,"Name":"internal.metrics.executorDeserializeCpuTime","Value":14201012,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":289,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":175,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":211,"Name":"time to update","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":202,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":220,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":214,"Name":"memory used by state","Value":"1216","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":205,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":300,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":169,"Internal":true,"Count Failed Values":true},{"ID":294,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":285,"Name":"internal.metrics.executorDeserializeTime","Value":13,"Internal":true,"Count Failed Values":true},{"ID":207,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":297,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorCpuTime","Value":48861676,"Internal":true,"Count Failed Values":true},{"ID":216,"Name":"count of cache hit on states cache in provider","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":174,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":210,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":219,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":201,"Name":"duration","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":222,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":213,"Name":"time to commit changes","Value":"68","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":302,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":212,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":203,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":170,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":215,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":287,"Name":"internal.metrics.executorRunTime","Value":159,"Internal":true,"Count Failed Values":true},{"ID":206,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":224,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":296,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":1,"Completion Time":1596020223725,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":5,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 1","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: [value#60, count#61]\nArguments: [value#60, count#61]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#60, count#61]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":310,"metricType":"sum"}]},"time":1596020223752} 
+{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":5,"time":1596020223761} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":4,"time":1596020223762} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":3,"time":1596020223762} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:03.168Z","batchId":1,"batchDuration":622,"durationMs":{"triggerExecution":622,"queryPlanning":47,"getBatch":0,"latestOffset":7,"addBatch":478,"walCommit":59},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1216,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":4,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48279}}","endOffset":"{\"test5\":{\"0\":48642}}","numInputRows":363,"inputRowsPerSecond":50.74793792814204,"processedRowsPerSecond":583.6012861736334}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":6,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 2","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48642}}, {\"test5\":{\"0\":48705}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, 
offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#604]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 39c861a0-0e30-4ca2-b363-495aff0f3f93, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 39c861a0-0e30-4ca2-b363-495aff0f3f93, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@52d6c50a\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@52d6c50a","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 2, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 2, numPartitions = 2], 
2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#528]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":394,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":393,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":390,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":391,"metricType":"timing"},{"name":"peak memory","accumulatorId":389,"metricType":"size"},{"name":"number of output rows","accumulatorId":388,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":392,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":385,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":386,"metricType":"timing"},{"name":"peak memory","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":383,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":387,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":382,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":334,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":335,"metricType":"nsTiming"},{"name":"records read","accumulatorId":332,"metricType":"sum"},{"name":"local bytes read","accumulatorId":330,"metricType":"size"},{"name":"fetch wait time","accumulatorId":331,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":328,"metricType":"size"},{"name":"local blocks read","accumulatorId":327,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":326,"metricType":"sum"},{"name":"data 
size","accumulatorId":325,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":329,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":333,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":381,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":378,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":379,"metricType":"timing"},{"name":"peak memory","accumulatorId":377,"metricType":"size"},{"name":"number of output rows","accumulatorId":376,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":380,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":375,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":365,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":366,"metricType":"sum"},{"name":"memory used by state","accumulatorId":371,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":373,"metricType":"sum"},{"name":"number of output rows","accumulatorId":364,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":372,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":374,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":370,"metricType":"timing"},{"name":"time to remove","accumulatorId":369,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":367,"metricType":"sum"},{"name":"time to update","accumulatorId":368,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":361,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":362,"metricType":"timing"},{"name":"peak memory","accumulatorId":360,"metricType":"size"},{"name":"number of output rows","accumulatorId":359,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":363,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":358,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020223909} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":7,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 2","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48642}}, {\"test5\":{\"0\":48705}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#680]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = c2fd3b95-1ba6-4d3e-8b9c-0256dfd90973, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = c2fd3b95-1ba6-4d3e-8b9c-0256dfd90973, opId = 0, ver = 0, 
numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@52d6c50a\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@52d6c50a","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 2, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 2, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#528]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output 
rows","accumulatorId":394,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":393,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":390,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":391,"metricType":"timing"},{"name":"peak memory","accumulatorId":389,"metricType":"size"},{"name":"number of output rows","accumulatorId":388,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":392,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":385,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":386,"metricType":"timing"},{"name":"peak memory","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":383,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":387,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":382,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":334,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":335,"metricType":"nsTiming"},{"name":"records read","accumulatorId":332,"metricType":"sum"},{"name":"local bytes read","accumulatorId":330,"metricType":"size"},{"name":"fetch wait time","accumulatorId":331,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":328,"metricType":"size"},{"name":"local blocks read","accumulatorId":327,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":326,"metricType":"sum"},{"name":"data size","accumulatorId":325,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":329,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":333,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":381,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":378,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":379,"metricType":"timing"},{"name":"peak memory","accumulatorId":377,"metricType":"size"},{"name":"number of output rows","accumulatorId":376,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":380,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":375,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":365,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":366,"metricType":"sum"},{"name":"memory used by state","accumulatorId":371,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":373,"metricType":"sum"},{"name":"number of output rows","accumulatorId":364,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":372,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":374,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":370,"metricType":"timing"},{"name":"time to remove","accumulatorId":369,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":367,"metricType":"sum"},{"name":"time to 
update","accumulatorId":368,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":361,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":362,"metricType":"timing"},{"name":"peak memory","accumulatorId":360,"metricType":"size"},{"name":"number of output rows","accumulatorId":359,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":363,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":358,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020224006} +{"Event":"SparkListenerJobStart","Job ID":2,"Submission Time":1596020224100,"Stage Infos":[{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"66\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"StateStoreRDD","Scope":"{\"id\":\"73\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"StateStoreRDD","Scope":"{\"id\":\"69\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"70\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"74\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[30],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[4],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":30,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"74\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[29],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"81\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":29,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[28],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"80\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"82\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"DataSourceRDD","Scope":"{\"id\":\"86\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"86\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[5,4],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 2","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"2","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"7","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage 
Info":{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":30,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"74\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[29],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"81\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":29,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[28],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"80\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"82\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"DataSourceRDD","Scope":"{\"id\":\"86\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"86\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224103,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 2","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"2","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"7","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":4,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":0,"Attempt":0,"Launch Time":1596020224113,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":4,"Stage Attempt 
ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":0,"Attempt":0,"Launch Time":1596020224113,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224174,"Failed":false,"Killed":false,"Accumulables":[{"ID":335,"Name":"shuffle write time","Update":"686296","Value":"686296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":334,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":333,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":325,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":382,"Name":"duration","Update":"39","Value":"39","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":383,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":384,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":386,"Name":"time in aggregation build","Update":"32","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":388,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":389,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":391,"Name":"time in aggregation build","Update":"26","Value":"26","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":393,"Name":"duration","Update":"40","Value":"40","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":394,"Name":"number of output rows","Update":"63","Value":"63","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":439,"Name":"internal.metrics.input.recordsRead","Update":63,"Value":63,"Internal":true,"Count Failed Values":true},{"ID":437,"Name":"internal.metrics.shuffle.write.writeTime","Update":686296,"Value":686296,"Internal":true,"Count Failed Values":true},{"ID":436,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":435,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":426,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorCpuTime","Update":33390843,"Value":33390843,"Internal":true,"Count Failed Values":true},{"ID":419,"Name":"internal.metrics.executorRunTime","Update":49,"Value":49,"Internal":true,"Count Failed Values":true},{"ID":418,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4867521,"Value":4867521,"Internal":true,"Count Failed Values":true},{"ID":417,"Name":"internal.metrics.executorDeserializeTime","Update":8,"Value":8,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":8,"Executor Deserialize CPU Time":4867521,"Executor Run Time":49,"Executor CPU Time":33390843,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":686296,"Shuffle Records Written":1},"Input Metrics":{"Bytes Read":0,"Records Read":63},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":30,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"74\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[29],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"81\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":29,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[28],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"80\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"82\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"DataSourceRDD","Scope":"{\"id\":\"86\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk 
Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"86\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224103,"Completion Time":1596020224175,"Accumulables":[{"ID":436,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":391,"Name":"time in aggregation build","Value":"26","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":382,"Name":"duration","Value":"39","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"internal.metrics.executorDeserializeCpuTime","Value":4867521,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":394,"Name":"number of output rows","Value":"63","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":439,"Name":"internal.metrics.input.recordsRead","Value":63,"Internal":true,"Count Failed Values":true},{"ID":388,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":334,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":325,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":420,"Name":"internal.metrics.executorCpuTime","Value":33390843,"Internal":true,"Count Failed Values":true},{"ID":426,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":417,"Name":"internal.metrics.executorDeserializeTime","Value":8,"Internal":true,"Count Failed Values":true},{"ID":435,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true},{"ID":384,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":393,"Name":"duration","Value":"40","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":333,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":383,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":437,"Name":"internal.metrics.shuffle.write.writeTime","Value":686296,"Internal":true,"Count Failed Values":true},{"ID":419,"Name":"internal.metrics.executorRunTime","Value":49,"Internal":true,"Count Failed Values":true},{"ID":386,"Name":"time in aggregation build","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":335,"Name":"shuffle write time","Value":"686296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":389,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"66\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"StateStoreRDD","Scope":"{\"id\":\"73\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"StateStoreRDD","Scope":"{\"id\":\"69\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"70\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"74\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[30],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[4],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224179,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 2","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"2","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"7","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":0,"Attempt":0,"Launch Time":1596020224187,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":5,"Stage Attempt 
ID":0,"Task Info":{"Task ID":8,"Index":1,"Attempt":0,"Launch Time":1596020224187,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":0,"Attempt":0,"Launch Time":1596020224187,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224256,"Failed":false,"Killed":false,"Accumulables":[{"ID":358,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":360,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":362,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":368,"Name":"time to update","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":369,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":370,"Name":"time to commit changes","Update":"32","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":372,"Name":"estimated size of state only on current version","Update":"88","Value":"88","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":373,"Name":"count of cache hit on states cache in provider","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":371,"Name":"memory used by state","Update":"400","Value":"400","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":375,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":377,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":379,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":459,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":458,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":457,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":456,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":455,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":454,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":453,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":451,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":446,"Name":"internal.metrics.resultSize","Update":5311,"Value":5311,"Internal":true,"Count Failed Values":true},{"ID":445,"Name":"internal.metrics.executorCpuTime","Update":17230622,"Value":17230622,"Internal":true,"Count Failed 
Values":true},{"ID":444,"Name":"internal.metrics.executorRunTime","Update":56,"Value":56,"Internal":true,"Count Failed Values":true},{"ID":443,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5948051,"Value":5948051,"Internal":true,"Count Failed Values":true},{"ID":442,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":6,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":5948051,"Executor Run Time":56,"Executor CPU Time":17230622,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":5,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":1,"Attempt":0,"Launch Time":1596020224187,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224257,"Failed":false,"Killed":false,"Accumulables":[{"ID":358,"Name":"duration","Update":"4","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":363,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":359,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":360,"Name":"peak memory","Update":"4456448","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":362,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":368,"Name":"time to update","Update":"21","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":367,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":369,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":370,"Name":"time to commit changes","Update":"18","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":372,"Name":"estimated size of state only on current version","Update":"368","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":364,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":373,"Name":"count of cache hit on states cache in provider","Update":"4","Value":"8","Internal":true,"Count 
Failed Values":true,"Metadata":"sql"},{"ID":371,"Name":"memory used by state","Update":"784","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":366,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":375,"Name":"duration","Update":"22","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":376,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":377,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":379,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":381,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":327,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":331,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":330,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":332,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":459,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":458,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":457,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":456,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":455,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":454,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":453,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":451,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":446,"Name":"internal.metrics.resultSize","Update":5574,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":445,"Name":"internal.metrics.executorCpuTime","Update":23808555,"Value":41039177,"Internal":true,"Count Failed Values":true},{"ID":444,"Name":"internal.metrics.executorRunTime","Update":56,"Value":112,"Internal":true,"Count Failed Values":true},{"ID":443,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6247106,"Value":12195157,"Internal":true,"Count Failed Values":true},{"ID":442,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":12,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":6247106,"Executor Run Time":56,"Executor CPU Time":23808555,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"66\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"StateStoreRDD","Scope":"{\"id\":\"73\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"StateStoreRDD","Scope":"{\"id\":\"69\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"70\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"74\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[30],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[4],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224179,"Completion Time":1596020224259,"Accumulables":[{"ID":442,"Name":"internal.metrics.executorDeserializeTime","Value":12,"Internal":true,"Count Failed Values":true},{"ID":451,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":445,"Name":"internal.metrics.executorCpuTime","Value":41039177,"Internal":true,"Count Failed Values":true},{"ID":364,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":454,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed Values":true},{"ID":373,"Name":"count of cache hit on states cache in provider","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":367,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":376,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":358,"Name":"duration","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":331,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":457,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed Values":true},{"ID":379,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":370,"Name":"time to commit changes","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":456,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":369,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":459,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":360,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":381,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":453,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":372,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":363,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":327,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":444,"Name":"internal.metrics.executorRunTime","Value":112,"Internal":true,"Count Failed Values":true},{"ID":375,"Name":"duration","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":366,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":330,"Name":"local bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":443,"Name":"internal.metrics.executorDeserializeCpuTime","Value":12195157,"Internal":true,"Count Failed Values":true},{"ID":455,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":446,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":332,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":377,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":359,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":458,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":368,"Name":"time to update","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":362,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":371,"Name":"memory used by state","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":2,"Completion Time":1596020224259,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":8,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 2","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: [value#74, count#75]\nArguments: [value#74, count#75]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#74, count#75]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":467,"metricType":"sum"}]},"time":1596020224278} 
+{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":8,"time":1596020224287} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":7,"time":1596020224287} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":6,"time":1596020224288} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:03.793Z","batchId":2,"batchDuration":522,"durationMs":{"triggerExecution":522,"queryPlanning":41,"getBatch":1,"latestOffset":3,"addBatch":421,"walCommit":27},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1184,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":8,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48642}}","endOffset":"{\"test5\":{\"0\":48705}}","numInputRows":63,"inputRowsPerSecond":100.8,"processedRowsPerSecond":120.6896551724138}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":9,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 3","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48705}}, {\"test5\":{\"0\":48757}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, 
timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#835]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 8bb5d8a6-42f8-4141-8f25-e1b98f81aac4, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 8bb5d8a6-42f8-4141-8f25-e1b98f81aac4, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@59b7c509\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@59b7c509","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 3, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 3, numPartitions = 2], 
2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#759]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":551,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":550,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":547,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":548,"metricType":"timing"},{"name":"peak memory","accumulatorId":546,"metricType":"size"},{"name":"number of output rows","accumulatorId":545,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":549,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":542,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":543,"metricType":"timing"},{"name":"peak memory","accumulatorId":541,"metricType":"size"},{"name":"number of output rows","accumulatorId":540,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":544,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":539,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":491,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":492,"metricType":"nsTiming"},{"name":"records read","accumulatorId":489,"metricType":"sum"},{"name":"local bytes read","accumulatorId":487,"metricType":"size"},{"name":"fetch wait time","accumulatorId":488,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":485,"metricType":"size"},{"name":"local blocks read","accumulatorId":484,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":483,"metricType":"sum"},{"name":"data 
size","accumulatorId":482,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":486,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":490,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":538,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":535,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":536,"metricType":"timing"},{"name":"peak memory","accumulatorId":534,"metricType":"size"},{"name":"number of output rows","accumulatorId":533,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":537,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":532,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":522,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":523,"metricType":"sum"},{"name":"memory used by state","accumulatorId":528,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":530,"metricType":"sum"},{"name":"number of output rows","accumulatorId":521,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":529,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":531,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":527,"metricType":"timing"},{"name":"time to remove","accumulatorId":526,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":524,"metricType":"sum"},{"name":"time to update","accumulatorId":525,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":518,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":519,"metricType":"timing"},{"name":"peak memory","accumulatorId":517,"metricType":"size"},{"name":"number of output rows","accumulatorId":516,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":520,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":515,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020224419} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":10,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 3","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48705}}, {\"test5\":{\"0\":48757}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#911]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 29402d2a-a5da-4bb1-8d1a-c6d1c2d998d5, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 29402d2a-a5da-4bb1-8d1a-c6d1c2d998d5, opId = 0, ver = 0, 
numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@59b7c509\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@59b7c509","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 3, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 3, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#759]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output 
rows","accumulatorId":551,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":550,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":547,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":548,"metricType":"timing"},{"name":"peak memory","accumulatorId":546,"metricType":"size"},{"name":"number of output rows","accumulatorId":545,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":549,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":542,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":543,"metricType":"timing"},{"name":"peak memory","accumulatorId":541,"metricType":"size"},{"name":"number of output rows","accumulatorId":540,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":544,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":539,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":491,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":492,"metricType":"nsTiming"},{"name":"records read","accumulatorId":489,"metricType":"sum"},{"name":"local bytes read","accumulatorId":487,"metricType":"size"},{"name":"fetch wait time","accumulatorId":488,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":485,"metricType":"size"},{"name":"local blocks read","accumulatorId":484,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":483,"metricType":"sum"},{"name":"data size","accumulatorId":482,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":486,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":490,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":538,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":535,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":536,"metricType":"timing"},{"name":"peak memory","accumulatorId":534,"metricType":"size"},{"name":"number of output rows","accumulatorId":533,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":537,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":532,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":522,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":523,"metricType":"sum"},{"name":"memory used by state","accumulatorId":528,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":530,"metricType":"sum"},{"name":"number of output rows","accumulatorId":521,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":529,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":531,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":527,"metricType":"timing"},{"name":"time to remove","accumulatorId":526,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":524,"metricType":"sum"},{"name":"time to 
update","accumulatorId":525,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":518,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":519,"metricType":"timing"},{"name":"peak memory","accumulatorId":517,"metricType":"size"},{"name":"number of output rows","accumulatorId":516,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":520,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":515,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020224452} +{"Event":"SparkListenerJobStart","Job ID":3,"Submission Time":1596020224533,"Stage Infos":[{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"107\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"115\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":36,"Name":"DataSourceRDD","Scope":"{\"id\":\"119\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"108\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"113\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"114\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":47,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"99\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":46,"Name":"StateStoreRDD","Scope":"{\"id\":\"102\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"103\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"StateStoreRDD","Scope":"{\"id\":\"106\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"107\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[6],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[6,7],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 3","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"3","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"10","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"107\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use 
Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"115\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":36,"Name":"DataSourceRDD","Scope":"{\"id\":\"119\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"108\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"113\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"114\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission 
Time":1596020224535,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 3","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"3","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"10","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":6,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":0,"Attempt":0,"Launch Time":1596020224541,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":6,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":0,"Attempt":0,"Launch Time":1596020224541,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224581,"Failed":false,"Killed":false,"Accumulables":[{"ID":492,"Name":"shuffle write time","Update":"643278","Value":"643278","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":491,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":490,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":482,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":539,"Name":"duration","Update":"20","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":540,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":541,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":543,"Name":"time in aggregation build","Update":"13","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":545,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":546,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":548,"Name":"time in aggregation build","Update":"9","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":550,"Name":"duration","Update":"20","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":551,"Name":"number of output rows","Update":"52","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":596,"Name":"internal.metrics.input.recordsRead","Update":52,"Value":52,"Internal":true,"Count Failed Values":true},{"ID":594,"Name":"internal.metrics.shuffle.write.writeTime","Update":643278,"Value":643278,"Internal":true,"Count Failed Values":true},{"ID":593,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":592,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":583,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":578,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":577,"Name":"internal.metrics.executorCpuTime","Update":29099071,"Value":29099071,"Internal":true,"Count Failed Values":true},{"ID":576,"Name":"internal.metrics.executorRunTime","Update":29,"Value":29,"Internal":true,"Count Failed Values":true},{"ID":575,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3091128,"Value":3091128,"Internal":true,"Count Failed Values":true},{"ID":574,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":3,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3091128,"Executor Run Time":29,"Executor CPU Time":29099071,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":643278,"Shuffle Records Written":1},"Input Metrics":{"Bytes 
Read":0,"Records Read":52},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"107\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"115\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":36,"Name":"DataSourceRDD","Scope":"{\"id\":\"119\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"108\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"113\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"114\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224535,"Completion Time":1596020224582,"Accumulables":[{"ID":550,"Name":"duration","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":541,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":577,"Name":"internal.metrics.executorCpuTime","Value":29099071,"Internal":true,"Count Failed Values":true},{"ID":490,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":576,"Name":"internal.metrics.executorRunTime","Value":29,"Internal":true,"Count Failed Values":true},{"ID":540,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":594,"Name":"internal.metrics.shuffle.write.writeTime","Value":643278,"Internal":true,"Count Failed Values":true},{"ID":543,"Name":"time in aggregation build","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":492,"Name":"shuffle write time","Value":"643278","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":546,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":539,"Name":"duration","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":575,"Name":"internal.metrics.executorDeserializeCpuTime","Value":3091128,"Internal":true,"Count Failed Values":true},{"ID":593,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":548,"Name":"time in aggregation build","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":578,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":596,"Name":"internal.metrics.input.recordsRead","Value":52,"Internal":true,"Count Failed Values":true},{"ID":551,"Name":"number of output rows","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":482,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":491,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":545,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":592,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true},{"ID":574,"Name":"internal.metrics.executorDeserializeTime","Value":3,"Internal":true,"Count Failed Values":true},{"ID":583,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true}],"Resource Profile Id":0}} 
+{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":47,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"99\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":46,"Name":"StateStoreRDD","Scope":"{\"id\":\"102\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"103\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"StateStoreRDD","Scope":"{\"id\":\"106\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"107\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[6],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224588,"Accumulables":[],"Resource Profile 
Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 3","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"3","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"10","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":10,"Index":0,"Attempt":0,"Launch Time":1596020224596,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":1,"Attempt":0,"Launch Time":1596020224597,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":10,"Index":0,"Attempt":0,"Launch Time":1596020224596,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224670,"Failed":false,"Killed":false,"Accumulables":[{"ID":515,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":517,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count 
Failed Values":true,"Metadata":"sql"},{"ID":519,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":525,"Name":"time to update","Update":"5","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":526,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":527,"Name":"time to commit changes","Update":"27","Value":"27","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":529,"Name":"estimated size of state only on current version","Update":"88","Value":"88","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":530,"Name":"count of cache hit on states cache in provider","Update":"6","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":528,"Name":"memory used by state","Update":"400","Value":"400","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":532,"Name":"duration","Update":"5","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":534,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":536,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":616,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":615,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":614,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":613,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":612,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":611,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":610,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":608,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":603,"Name":"internal.metrics.resultSize","Update":5311,"Value":5311,"Internal":true,"Count Failed Values":true},{"ID":602,"Name":"internal.metrics.executorCpuTime","Update":19967906,"Value":19967906,"Internal":true,"Count Failed Values":true},{"ID":601,"Name":"internal.metrics.executorRunTime","Update":62,"Value":62,"Internal":true,"Count Failed Values":true},{"ID":600,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4899567,"Value":4899567,"Internal":true,"Count Failed Values":true},{"ID":599,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":4,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor 
Deserialize Time":4,"Executor Deserialize CPU Time":4899567,"Executor Run Time":62,"Executor CPU Time":19967906,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":1,"Attempt":0,"Launch Time":1596020224597,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224687,"Failed":false,"Killed":false,"Accumulables":[{"ID":515,"Name":"duration","Update":"4","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":520,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":516,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":517,"Name":"peak memory","Update":"4456448","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":519,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":525,"Name":"time to update","Update":"17","Value":"22","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":524,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":526,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":527,"Name":"time to commit changes","Update":"26","Value":"53","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":529,"Name":"estimated size of state only on current version","Update":"368","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":521,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":530,"Name":"count of cache hit on states cache in provider","Update":"6","Value":"12","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":528,"Name":"memory used by state","Update":"784","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":523,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":532,"Name":"duration","Update":"17","Value":"22","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":533,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":534,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":536,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":538,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":484,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":488,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":487,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":489,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":616,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":615,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":614,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":613,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":612,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":611,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":610,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":608,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":603,"Name":"internal.metrics.resultSize","Update":5574,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":602,"Name":"internal.metrics.executorCpuTime","Update":22402538,"Value":42370444,"Internal":true,"Count Failed Values":true},{"ID":601,"Name":"internal.metrics.executorRunTime","Update":79,"Value":141,"Internal":true,"Count Failed Values":true},{"ID":600,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4671511,"Value":9571078,"Internal":true,"Count Failed Values":true},{"ID":599,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":8,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4671511,"Executor Run Time":79,"Executor CPU Time":22402538,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"start at 
StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":47,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"99\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":46,"Name":"StateStoreRDD","Scope":"{\"id\":\"102\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"103\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"StateStoreRDD","Scope":"{\"id\":\"106\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"107\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[6],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224588,"Completion Time":1596020224688,"Accumulables":[{"ID":523,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":532,"Name":"duration","Value":"22","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":487,"Name":"local bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":517,"Name":"peak 
memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":600,"Name":"internal.metrics.executorDeserializeCpuTime","Value":9571078,"Internal":true,"Count Failed Values":true},{"ID":603,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":612,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":516,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":615,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":534,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":525,"Name":"time to update","Value":"22","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":489,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":528,"Name":"memory used by state","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":519,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":608,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":599,"Name":"internal.metrics.executorDeserializeTime","Value":8,"Internal":true,"Count Failed Values":true},{"ID":521,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":530,"Name":"count of cache hit on states cache in provider","Value":"12","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":611,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed Values":true},{"ID":602,"Name":"internal.metrics.executorCpuTime","Value":42370444,"Internal":true,"Count Failed Values":true},{"ID":488,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":515,"Name":"duration","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":524,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":533,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":614,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed Values":true},{"ID":536,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":527,"Name":"time to commit changes","Value":"53","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":613,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":616,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":526,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":520,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":610,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":601,"Name":"internal.metrics.executorRunTime","Value":141,"Internal":true,"Count Failed Values":true},{"ID":484,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":538,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":529,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":3,"Completion Time":1596020224689,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":11,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 3","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: [value#88, count#89]\nArguments: [value#88, count#89]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#88, count#89]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":624,"metricType":"sum"}]},"time":1596020224709} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":11,"time":1596020224713} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":10,"time":1596020224714} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":9,"time":1596020224714} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:04.317Z","batchId":3,"batchDuration":415,"durationMs":{"triggerExecution":415,"queryPlanning":38,"getBatch":1,"latestOffset":3,"addBatch":332,"walCommit":21},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1184,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":12,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48705}}","endOffset":"{\"test5\":{\"0\":48757}}","numInputRows":52,"inputRowsPerSecond":99.23664122137404,"processedRowsPerSecond":125.30120481927712}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} 
+{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":12,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 4","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48757}}, {\"test5\":{\"0\":48799}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1066]\n\n(10) 
StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 42efe357-12ef-4061-9b83-20bf4c29a257, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 42efe357-12ef-4061-9b83-20bf4c29a257, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1717338b\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1717338b","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 4, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 4, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#990]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, 
timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":708,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":707,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":704,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":705,"metricType":"timing"},{"name":"peak memory","accumulatorId":703,"metricType":"size"},{"name":"number of output rows","accumulatorId":702,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":706,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":699,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":700,"metricType":"timing"},{"name":"peak memory","accumulatorId":698,"metricType":"size"},{"name":"number of output rows","accumulatorId":697,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":701,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":696,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":648,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":649,"metricType":"nsTiming"},{"name":"records read","accumulatorId":646,"metricType":"sum"},{"name":"local bytes read","accumulatorId":644,"metricType":"size"},{"name":"fetch wait time","accumulatorId":645,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":642,"metricType":"size"},{"name":"local blocks read","accumulatorId":641,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":640,"metricType":"sum"},{"name":"data size","accumulatorId":639,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":643,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":647,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":695,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":692,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":693,"metricType":"timing"},{"name":"peak memory","accumulatorId":691,"metricType":"size"},{"name":"number of output rows","accumulatorId":690,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":694,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":689,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":679,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":680,"metricType":"sum"},{"name":"memory used by state","accumulatorId":685,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":687,"metricType":"sum"},{"name":"number of output rows","accumulatorId":678,"metricType":"sum"},{"name":"estimated size of state only on current 
version","accumulatorId":686,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":688,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":684,"metricType":"timing"},{"name":"time to remove","accumulatorId":683,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":681,"metricType":"sum"},{"name":"time to update","accumulatorId":682,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":675,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":676,"metricType":"timing"},{"name":"peak memory","accumulatorId":674,"metricType":"size"},{"name":"number of output rows","accumulatorId":673,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":677,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":672,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020224817} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":13,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 4","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48757}}, {\"test5\":{\"0\":48799}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, 
timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1142]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 6fa28bd2-2924-4e01-8bbe-128888d2669b, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 6fa28bd2-2924-4e01-8bbe-128888d2669b, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1717338b\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1717338b","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 4, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 4, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#990]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], 
functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":708,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":707,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":704,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":705,"metricType":"timing"},{"name":"peak memory","accumulatorId":703,"metricType":"size"},{"name":"number of output rows","accumulatorId":702,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":706,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":699,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":700,"metricType":"timing"},{"name":"peak memory","accumulatorId":698,"metricType":"size"},{"name":"number of output rows","accumulatorId":697,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":701,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":696,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":648,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":649,"metricType":"nsTiming"},{"name":"records read","accumulatorId":646,"metricType":"sum"},{"name":"local bytes read","accumulatorId":644,"metricType":"size"},{"name":"fetch wait time","accumulatorId":645,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":642,"metricType":"size"},{"name":"local blocks read","accumulatorId":641,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":640,"metricType":"sum"},{"name":"data size","accumulatorId":639,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":643,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":647,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output 
rows","accumulatorId":695,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":692,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":693,"metricType":"timing"},{"name":"peak memory","accumulatorId":691,"metricType":"size"},{"name":"number of output rows","accumulatorId":690,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":694,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":689,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":679,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":680,"metricType":"sum"},{"name":"memory used by state","accumulatorId":685,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":687,"metricType":"sum"},{"name":"number of output rows","accumulatorId":678,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":686,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":688,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":684,"metricType":"timing"},{"name":"time to remove","accumulatorId":683,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":681,"metricType":"sum"},{"name":"time to update","accumulatorId":682,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":675,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":676,"metricType":"timing"},{"name":"peak memory","accumulatorId":674,"metricType":"size"},{"name":"number of output rows","accumulatorId":673,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":677,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":672,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020224849} +{"Event":"SparkListenerJobStart","Job ID":4,"Submission Time":1596020224928,"Stage Infos":[{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"132\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":55,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"140\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":56,"Name":"StateStoreRDD","Scope":"{\"id\":\"139\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[55],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":57,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"136\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at 
StructuredKafkaWordCount.scala:86","Parent IDs":[56],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"StateStoreRDD","Scope":"{\"id\":\"135\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[57],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":8,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":54,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"140\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"141\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"147\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"152\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"146\",\"name\":\"MapPartitions\"}","Callsite":"start at 
StructuredKafkaWordCount.scala:86","Parent IDs":[51],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"148\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"DataSourceRDD","Scope":"{\"id\":\"152\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[9,8],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 4","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"4","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"13","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":8,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":54,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"140\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"141\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"147\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"152\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"146\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[51],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"148\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent 
IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"DataSourceRDD","Scope":"{\"id\":\"152\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224929,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at 
StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 4","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"4","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"13","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":8,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":0,"Attempt":0,"Launch Time":1596020224941,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":8,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":0,"Attempt":0,"Launch Time":1596020224941,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020224979,"Failed":false,"Killed":false,"Accumulables":[{"ID":649,"Name":"shuffle write time","Update":"572754","Value":"572754","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":648,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":647,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":639,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":696,"Name":"duration","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":697,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":698,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":700,"Name":"time in aggregation build","Update":"13","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":702,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"time in aggregation build","Update":"9","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":707,"Name":"duration","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":708,"Name":"number of output rows","Update":"42","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.recordsRead","Update":42,"Value":42,"Internal":true,"Count Failed Values":true},{"ID":751,"Name":"internal.metrics.shuffle.write.writeTime","Update":572754,"Value":572754,"Internal":true,"Count Failed Values":true},{"ID":750,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":749,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed 
Values":true},{"ID":740,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorCpuTime","Update":27800373,"Value":27800373,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorRunTime","Update":28,"Value":28,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4768103,"Value":4768103,"Internal":true,"Count Failed Values":true},{"ID":731,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":4,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4768103,"Executor Run Time":28,"Executor CPU Time":27800373,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":572754,"Shuffle Records Written":1},"Input Metrics":{"Bytes Read":0,"Records Read":42},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":8,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":54,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"140\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"141\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"147\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"152\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of 
Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"146\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[51],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"148\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"DataSourceRDD","Scope":"{\"id\":\"152\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224929,"Completion Time":1596020224979,"Accumulables":[{"ID":732,"Name":"internal.metrics.executorDeserializeCpuTime","Value":4768103,"Internal":true,"Count Failed Values":true},{"ID":696,"Name":"duration","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":750,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":705,"Name":"time in aggregation build","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":735,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"number of output rows","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.recordsRead","Value":42,"Internal":true,"Count Failed Values":true},{"ID":648,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":639,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":702,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":740,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":731,"Name":"internal.metrics.executorDeserializeTime","Value":4,"Internal":true,"Count Failed Values":true},{"ID":749,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true},{"ID":698,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":734,"Name":"internal.metrics.executorCpuTime","Value":27800373,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"duration","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":647,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":733,"Name":"internal.metrics.executorRunTime","Value":28,"Internal":true,"Count Failed Values":true},{"ID":697,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":751,"Name":"internal.metrics.shuffle.write.writeTime","Value":572754,"Internal":true,"Count Failed Values":true},{"ID":700,"Name":"time in aggregation build","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":649,"Name":"shuffle write time","Value":"572754","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"132\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":55,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"140\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":56,"Name":"StateStoreRDD","Scope":"{\"id\":\"139\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[55],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":57,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"136\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[56],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"StateStoreRDD","Scope":"{\"id\":\"135\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[57],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[8],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224987,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 4","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"4","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"13","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} 
+{"Event":"SparkListenerTaskStart","Stage ID":9,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":0,"Attempt":0,"Launch Time":1596020224994,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":9,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":1,"Attempt":0,"Launch Time":1596020224994,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":9,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":1,"Attempt":0,"Launch Time":1596020224994,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225056,"Failed":false,"Killed":false,"Accumulables":[{"ID":672,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":677,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":673,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":674,"Name":"peak memory","Update":"4456448","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":676,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":682,"Name":"time to update","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":681,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":683,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":684,"Name":"time to commit changes","Update":"11","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":686,"Name":"estimated size of state only on current version","Update":"368","Value":"368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":678,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"count of cache hit on states cache in provider","Update":"8","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"memory used by state","Update":"784","Value":"784","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":680,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":689,"Name":"duration","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":690,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":691,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":693,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":695,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":641,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":645,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":644,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":646,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":773,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":772,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":771,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":770,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":769,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":768,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":767,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":765,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":760,"Name":"internal.metrics.resultSize","Update":5574,"Value":5574,"Internal":true,"Count Failed Values":true},{"ID":759,"Name":"internal.metrics.executorCpuTime","Update":19548688,"Value":19548688,"Internal":true,"Count Failed Values":true},{"ID":758,"Name":"internal.metrics.executorRunTime","Update":52,"Value":52,"Internal":true,"Count Failed Values":true},{"ID":757,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5622533,"Value":5622533,"Internal":true,"Count Failed Values":true},{"ID":756,"Name":"internal.metrics.executorDeserializeTime","Update":5,"Value":5,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":5,"Executor Deserialize CPU Time":5622533,"Executor Run Time":52,"Executor CPU Time":19548688,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":9,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task 
Info":{"Task ID":13,"Index":0,"Attempt":0,"Launch Time":1596020224994,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225058,"Failed":false,"Killed":false,"Accumulables":[{"ID":672,"Name":"duration","Update":"2","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":674,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":676,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":682,"Name":"time to update","Update":"4","Value":"23","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":683,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":684,"Name":"time to commit changes","Update":"35","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":686,"Name":"estimated size of state only on current version","Update":"88","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"count of cache hit on states cache in provider","Update":"8","Value":"16","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"memory used by state","Update":"400","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":689,"Name":"duration","Update":"4","Value":"23","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":691,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":693,"Name":"time in aggregation build","Update":"0","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":773,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":772,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":771,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":770,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":769,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":768,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":767,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":765,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":760,"Name":"internal.metrics.resultSize","Update":5311,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":759,"Name":"internal.metrics.executorCpuTime","Update":16813539,"Value":36362227,"Internal":true,"Count Failed Values":true},{"ID":758,"Name":"internal.metrics.executorRunTime","Update":55,"Value":107,"Internal":true,"Count Failed Values":true},{"ID":757,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4322992,"Value":9945525,"Internal":true,"Count Failed Values":true},{"ID":756,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":9,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4322992,"Executor Run Time":55,"Executor CPU Time":16813539,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"132\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":55,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"140\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":56,"Name":"StateStoreRDD","Scope":"{\"id\":\"139\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[55],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":57,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"136\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[56],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"StateStoreRDD","Scope":"{\"id\":\"135\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[57],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[8],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020224987,"Completion Time":1596020225059,"Accumulables":[{"ID":765,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":756,"Name":"internal.metrics.executorDeserializeTime","Value":9,"Internal":true,"Count Failed Values":true},{"ID":678,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":759,"Name":"internal.metrics.executorCpuTime","Value":36362227,"Internal":true,"Count Failed Values":true},{"ID":768,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed Values":true},{"ID":687,"Name":"count of cache hit on states cache in provider","Value":"16","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":681,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":771,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed Values":true},{"ID":690,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":672,"Name":"duration","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":645,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":684,"Name":"time to commit changes","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":693,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":770,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":683,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":773,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":686,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":695,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":677,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":767,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":641,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":758,"Name":"internal.metrics.executorRunTime","Value":107,"Internal":true,"Count Failed Values":true},{"ID":644,"Name":"local bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":680,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":689,"Name":"duration","Value":"23","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":674,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":757,"Name":"internal.metrics.executorDeserializeCpuTime","Value":9945525,"Internal":true,"Count Failed Values":true},{"ID":769,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":760,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":772,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":646,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":682,"Name":"time to update","Value":"23","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":691,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":673,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":676,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"memory used by state","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":4,"Completion Time":1596020225059,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":14,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 4","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) 
LocalTableScan\nOutput [2]: [value#102, count#103]\nArguments: [value#102, count#103]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#102, count#103]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":781,"metricType":"sum"}]},"time":1596020225079} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":14,"time":1596020225087} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":13,"time":1596020225087} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":12,"time":1596020225087} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:04.734Z","batchId":4,"batchDuration":387,"durationMs":{"triggerExecution":387,"queryPlanning":30,"getBatch":1,"latestOffset":3,"addBatch":306,"walCommit":12},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1184,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":16,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48757}}","endOffset":"{\"test5\":{\"0\":48799}}","numInputRows":42,"inputRowsPerSecond":100.71942446043165,"processedRowsPerSecond":108.52713178294573}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":15,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 5","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, 
timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48799}}, {\"test5\":{\"0\":48837}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1297]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 9579cc6c-8827-43f7-9678-7747602e493e, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 9579cc6c-8827-43f7-9678-7747602e493e, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@2c214312\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@2c214312","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 5, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], 
functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 5, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#1221]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":865,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":864,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":861,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":862,"metricType":"timing"},{"name":"peak memory","accumulatorId":860,"metricType":"size"},{"name":"number of output rows","accumulatorId":859,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":863,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":856,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":857,"metricType":"timing"},{"name":"peak memory","accumulatorId":855,"metricType":"size"},{"name":"number of output rows","accumulatorId":854,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":858,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":853,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":805,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":806,"metricType":"nsTiming"},{"name":"records read","accumulatorId":803,"metricType":"sum"},{"name":"local bytes 
read","accumulatorId":801,"metricType":"size"},{"name":"fetch wait time","accumulatorId":802,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":799,"metricType":"size"},{"name":"local blocks read","accumulatorId":798,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":797,"metricType":"sum"},{"name":"data size","accumulatorId":796,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":800,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":804,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":852,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":849,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":850,"metricType":"timing"},{"name":"peak memory","accumulatorId":848,"metricType":"size"},{"name":"number of output rows","accumulatorId":847,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":851,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":846,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":836,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":837,"metricType":"sum"},{"name":"memory used by state","accumulatorId":842,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":844,"metricType":"sum"},{"name":"number of output rows","accumulatorId":835,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":843,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":845,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":841,"metricType":"timing"},{"name":"time to remove","accumulatorId":840,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":838,"metricType":"sum"},{"name":"time to update","accumulatorId":839,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":832,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":833,"metricType":"timing"},{"name":"peak memory","accumulatorId":831,"metricType":"size"},{"name":"number of output rows","accumulatorId":830,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":834,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":829,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020225211} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":16,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 5","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48799}}, {\"test5\":{\"0\":48837}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1373]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = b800d96e-7584-4e8d-8df8-c9b901b7f2e2, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = b800d96e-7584-4e8d-8df8-c9b901b7f2e2, opId = 0, ver = 0, 
numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@2c214312\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@2c214312","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 5, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 5, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#1221]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output 
rows","accumulatorId":865,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":864,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":861,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":862,"metricType":"timing"},{"name":"peak memory","accumulatorId":860,"metricType":"size"},{"name":"number of output rows","accumulatorId":859,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":863,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":856,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":857,"metricType":"timing"},{"name":"peak memory","accumulatorId":855,"metricType":"size"},{"name":"number of output rows","accumulatorId":854,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":858,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":853,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":805,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":806,"metricType":"nsTiming"},{"name":"records read","accumulatorId":803,"metricType":"sum"},{"name":"local bytes read","accumulatorId":801,"metricType":"size"},{"name":"fetch wait time","accumulatorId":802,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":799,"metricType":"size"},{"name":"local blocks read","accumulatorId":798,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":797,"metricType":"sum"},{"name":"data size","accumulatorId":796,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":800,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":804,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":852,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":849,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":850,"metricType":"timing"},{"name":"peak memory","accumulatorId":848,"metricType":"size"},{"name":"number of output rows","accumulatorId":847,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":851,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":846,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":836,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":837,"metricType":"sum"},{"name":"memory used by state","accumulatorId":842,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":844,"metricType":"sum"},{"name":"number of output rows","accumulatorId":835,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":843,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":845,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":841,"metricType":"timing"},{"name":"time to remove","accumulatorId":840,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":838,"metricType":"sum"},{"name":"time to 
update","accumulatorId":839,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":832,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":833,"metricType":"timing"},{"name":"peak memory","accumulatorId":831,"metricType":"size"},{"name":"number of output rows","accumulatorId":830,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":834,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":829,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020225270} +{"Event":"SparkListenerJobStart","Job ID":5,"Submission Time":1596020225342,"Stage Infos":[{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":66,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"173\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[65],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"181\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"179\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"185\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"174\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"DataSourceRDD","Scope":"{\"id\":\"185\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"180\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":71,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"165\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[70],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":70,"Name":"StateStoreRDD","Scope":"{\"id\":\"168\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[69],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":69,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"169\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[68],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":67,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[66],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":68,"Name":"StateStoreRDD","Scope":"{\"id\":\"172\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[67],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[10],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[10,11],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 5","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"5","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"16","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":66,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"173\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[65],"Storage Level":{"Use Disk":false,"Use 
Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"181\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"179\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"185\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"174\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"DataSourceRDD","Scope":"{\"id\":\"185\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"180\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission 
Time":1596020225343,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 5","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"5","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"16","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":0,"Attempt":0,"Launch Time":1596020225359,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":0,"Attempt":0,"Launch Time":1596020225359,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225400,"Failed":false,"Killed":false,"Accumulables":[{"ID":806,"Name":"shuffle write time","Update":"530930","Value":"530930","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":805,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":804,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":796,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count 
Failed Values":true,"Metadata":"sql"},{"ID":853,"Name":"duration","Update":"21","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":854,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":855,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":857,"Name":"time in aggregation build","Update":"14","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":859,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":860,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":862,"Name":"time in aggregation build","Update":"9","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":864,"Name":"duration","Update":"21","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":865,"Name":"number of output rows","Update":"38","Value":"38","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":910,"Name":"internal.metrics.input.recordsRead","Update":38,"Value":38,"Internal":true,"Count Failed Values":true},{"ID":908,"Name":"internal.metrics.shuffle.write.writeTime","Update":530930,"Value":530930,"Internal":true,"Count Failed Values":true},{"ID":907,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":906,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":897,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":892,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":891,"Name":"internal.metrics.executorCpuTime","Update":22440089,"Value":22440089,"Internal":true,"Count Failed Values":true},{"ID":890,"Name":"internal.metrics.executorRunTime","Update":29,"Value":29,"Internal":true,"Count Failed Values":true},{"ID":889,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6808170,"Value":6808170,"Internal":true,"Count Failed Values":true},{"ID":888,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":6,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":6808170,"Executor Run Time":29,"Executor CPU Time":22440089,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":530930,"Shuffle Records Written":1},"Input 
Metrics":{"Bytes Read":0,"Records Read":38},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":66,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"173\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[65],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"181\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"179\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"185\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"174\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"DataSourceRDD","Scope":"{\"id\":\"185\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"180\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020225343,"Completion Time":1596020225401,"Accumulables":[{"ID":855,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":891,"Name":"internal.metrics.executorCpuTime","Value":22440089,"Internal":true,"Count Failed Values":true},{"ID":864,"Name":"duration","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":804,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":908,"Name":"internal.metrics.shuffle.write.writeTime","Value":530930,"Internal":true,"Count Failed Values":true},{"ID":890,"Name":"internal.metrics.executorRunTime","Value":29,"Internal":true,"Count Failed Values":true},{"ID":857,"Name":"time in aggregation build","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":860,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":806,"Name":"shuffle write time","Value":"530930","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":854,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":853,"Name":"duration","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":862,"Name":"time in aggregation build","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":889,"Name":"internal.metrics.executorDeserializeCpuTime","Value":6808170,"Internal":true,"Count Failed Values":true},{"ID":907,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":892,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":910,"Name":"internal.metrics.input.recordsRead","Value":38,"Internal":true,"Count Failed Values":true},{"ID":865,"Name":"number of output rows","Value":"38","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":805,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":796,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":859,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":888,"Name":"internal.metrics.executorDeserializeTime","Value":6,"Internal":true,"Count Failed Values":true},{"ID":897,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":906,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true}],"Resource Profile Id":0}} 
+{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":71,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"165\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[70],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":70,"Name":"StateStoreRDD","Scope":"{\"id\":\"168\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[69],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":69,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"169\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[68],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":67,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[66],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":68,"Name":"StateStoreRDD","Scope":"{\"id\":\"172\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[67],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[10],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020225410,"Accumulables":[],"Resource Profile 
Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 5","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"5","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"16","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1596020225417,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":1,"Attempt":0,"Launch Time":1596020225417,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":1,"Attempt":0,"Launch Time":1596020225417,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225498,"Failed":false,"Killed":false,"Accumulables":[{"ID":829,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":834,"Name":"avg hash probe bucket list 
iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":830,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":831,"Name":"peak memory","Update":"4456448","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":833,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":839,"Name":"time to update","Update":"11","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":838,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":840,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":841,"Name":"time to commit changes","Update":"37","Value":"37","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":843,"Name":"estimated size of state only on current version","Update":"368","Value":"368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":835,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":844,"Name":"count of cache hit on states cache in provider","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":842,"Name":"memory used by state","Update":"784","Value":"784","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":837,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":846,"Name":"duration","Update":"11","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":847,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":848,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":850,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":852,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":798,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":802,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":801,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":803,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":930,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":929,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":928,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":927,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":926,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed 
Values":true},{"ID":925,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":924,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":922,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":917,"Name":"internal.metrics.resultSize","Update":5574,"Value":5574,"Internal":true,"Count Failed Values":true},{"ID":916,"Name":"internal.metrics.executorCpuTime","Update":17945299,"Value":17945299,"Internal":true,"Count Failed Values":true},{"ID":915,"Name":"internal.metrics.executorRunTime","Update":68,"Value":68,"Internal":true,"Count Failed Values":true},{"ID":914,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3451032,"Value":3451032,"Internal":true,"Count Failed Values":true},{"ID":913,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":3,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3451032,"Executor Run Time":68,"Executor CPU Time":17945299,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":0,"Attempt":0,"Launch Time":1596020225417,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225509,"Failed":false,"Killed":false,"Accumulables":[{"ID":829,"Name":"duration","Update":"2","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":831,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":833,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":839,"Name":"time to update","Update":"4","Value":"15","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":840,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":841,"Name":"time to commit changes","Update":"50","Value":"87","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":843,"Name":"estimated size of state only on current version","Update":"88","Value":"456","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":844,"Name":"count of cache hit on states cache in provider","Update":"10","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":842,"Name":"memory used by state","Update":"400","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":846,"Name":"duration","Update":"4","Value":"15","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":848,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":850,"Name":"time in aggregation build","Update":"0","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":930,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":929,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":928,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":927,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":926,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":925,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":924,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":922,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":917,"Name":"internal.metrics.resultSize","Update":5311,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":916,"Name":"internal.metrics.executorCpuTime","Update":15599091,"Value":33544390,"Internal":true,"Count Failed Values":true},{"ID":915,"Name":"internal.metrics.executorRunTime","Update":84,"Value":152,"Internal":true,"Count Failed Values":true},{"ID":914,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4357806,"Value":7808838,"Internal":true,"Count Failed Values":true},{"ID":913,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":7,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4357806,"Executor Run Time":84,"Executor CPU Time":15599091,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records 
Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":71,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"165\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[70],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":70,"Name":"StateStoreRDD","Scope":"{\"id\":\"168\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[69],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":69,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"169\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[68],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":67,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[66],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":68,"Name":"StateStoreRDD","Scope":"{\"id\":\"172\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[67],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[10],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020225410,"Completion Time":1596020225514,"Accumulables":[{"ID":846,"Name":"duration","Value":"15","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":837,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":801,"Name":"local 
bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":831,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":926,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":917,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":830,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":848,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":803,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":839,"Name":"time to update","Value":"15","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":929,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":833,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":842,"Name":"memory used by state","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":914,"Name":"internal.metrics.executorDeserializeCpuTime","Value":7808838,"Internal":true,"Count Failed Values":true},{"ID":922,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":913,"Name":"internal.metrics.executorDeserializeTime","Value":7,"Internal":true,"Count Failed Values":true},{"ID":925,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed Values":true},{"ID":844,"Name":"count of cache hit on states cache in provider","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":835,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":916,"Name":"internal.metrics.executorCpuTime","Value":33544390,"Internal":true,"Count Failed Values":true},{"ID":829,"Name":"duration","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":928,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed Values":true},{"ID":802,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":838,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":847,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":850,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":841,"Name":"time to commit changes","Value":"87","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":927,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":930,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":840,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":834,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":852,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":798,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":843,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":915,"Name":"internal.metrics.executorRunTime","Value":152,"Internal":true,"Count Failed Values":true},{"ID":924,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":5,"Completion Time":1596020225514,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":17,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 5","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: [value#116, count#117]\nArguments: [value#116, count#117]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#116, count#117]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":938,"metricType":"sum"}]},"time":1596020225536} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":17,"time":1596020225541} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":16,"time":1596020225542} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":15,"time":1596020225542} 
+{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:05.123Z","batchId":5,"batchDuration":437,"durationMs":{"triggerExecution":437,"queryPlanning":35,"getBatch":1,"latestOffset":3,"addBatch":361,"walCommit":18},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1184,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":20,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48799}}","endOffset":"{\"test5\":{\"0\":48837}}","numInputRows":38,"inputRowsPerSecond":97.68637532133675,"processedRowsPerSecond":86.95652173913044}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":18,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 6","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48837}}, {\"test5\":{\"0\":48881}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) 
DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1528]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 6a12c2d9-8d02-4241-93fc-f53da01bb454, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 6a12c2d9-8d02-4241-93fc-f53da01bb454, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27ec018d\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27ec018d","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 6, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 6, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#1452]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], 
functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1022,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1021,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1018,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1019,"metricType":"timing"},{"name":"peak memory","accumulatorId":1017,"metricType":"size"},{"name":"number of output rows","accumulatorId":1016,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1020,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1013,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1014,"metricType":"timing"},{"name":"peak memory","accumulatorId":1012,"metricType":"size"},{"name":"number of output rows","accumulatorId":1011,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1015,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1010,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":962,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":963,"metricType":"nsTiming"},{"name":"records read","accumulatorId":960,"metricType":"sum"},{"name":"local bytes read","accumulatorId":958,"metricType":"size"},{"name":"fetch wait time","accumulatorId":959,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":956,"metricType":"size"},{"name":"local blocks read","accumulatorId":955,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":954,"metricType":"sum"},{"name":"data size","accumulatorId":953,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":957,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":961,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output 
rows","accumulatorId":1009,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1006,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1007,"metricType":"timing"},{"name":"peak memory","accumulatorId":1005,"metricType":"size"},{"name":"number of output rows","accumulatorId":1004,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1008,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1003,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":993,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":994,"metricType":"sum"},{"name":"memory used by state","accumulatorId":999,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":1001,"metricType":"sum"},{"name":"number of output rows","accumulatorId":992,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":1000,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":1002,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":998,"metricType":"timing"},{"name":"time to remove","accumulatorId":997,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":995,"metricType":"sum"},{"name":"time to update","accumulatorId":996,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":989,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":990,"metricType":"timing"},{"name":"peak memory","accumulatorId":988,"metricType":"size"},{"name":"number of output rows","accumulatorId":987,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":991,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":986,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020225657} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":19,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 6","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 
(14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48837}}, {\"test5\":{\"0\":48881}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1604]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 96456757-8d0b-46da-a006-9fe2cb6fc936, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = 96456757-8d0b-46da-a006-9fe2cb6fc936, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27ec018d\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@27ec018d","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave 
[value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 6, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 6, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#1452]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1022,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1021,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1018,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1019,"metricType":"timing"},{"name":"peak memory","accumulatorId":1017,"metricType":"size"},{"name":"number of output rows","accumulatorId":1016,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1020,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1013,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1014,"metricType":"timing"},{"name":"peak memory","accumulatorId":1012,"metricType":"size"},{"name":"number of output rows","accumulatorId":1011,"metricType":"sum"},{"name":"avg hash probe bucket list 
iters","accumulatorId":1015,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1010,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":962,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":963,"metricType":"nsTiming"},{"name":"records read","accumulatorId":960,"metricType":"sum"},{"name":"local bytes read","accumulatorId":958,"metricType":"size"},{"name":"fetch wait time","accumulatorId":959,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":956,"metricType":"size"},{"name":"local blocks read","accumulatorId":955,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":954,"metricType":"sum"},{"name":"data size","accumulatorId":953,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":957,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":961,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1009,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1006,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1007,"metricType":"timing"},{"name":"peak memory","accumulatorId":1005,"metricType":"size"},{"name":"number of output rows","accumulatorId":1004,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1008,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1003,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":993,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":994,"metricType":"sum"},{"name":"memory used by state","accumulatorId":999,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":1001,"metricType":"sum"},{"name":"number of output rows","accumulatorId":992,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":1000,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":1002,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":998,"metricType":"timing"},{"name":"time to remove","accumulatorId":997,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":995,"metricType":"sum"},{"name":"time to update","accumulatorId":996,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":989,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":990,"metricType":"timing"},{"name":"peak memory","accumulatorId":988,"metricType":"size"},{"name":"number of output rows","accumulatorId":987,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":991,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":986,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020225687} +{"Event":"SparkListenerJobStart","Job ID":6,"Submission Time":1596020225759,"Stage Infos":[{"Stage ID":12,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":78,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"206\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[77],"Storage Level":{"Use Disk":false,"Use 
Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":75,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"213\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[74],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":74,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"214\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[73],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":77,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"207\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[76],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":72,"Name":"DataSourceRDD","Scope":"{\"id\":\"218\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":73,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"218\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[72],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":76,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"212\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[75],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":83,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"198\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[82],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":81,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"202\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[80],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":79,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"206\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[78],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":80,"Name":"StateStoreRDD","Scope":"{\"id\":\"205\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[79],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":82,"Name":"StateStoreRDD","Scope":"{\"id\":\"201\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[81],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[12,13],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 6","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"6","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"19","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":12,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":78,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"206\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[77],"Storage Level":{"Use Disk":false,"Use 
Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":75,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"213\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[74],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":74,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"214\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[73],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":77,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"207\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[76],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":72,"Name":"DataSourceRDD","Scope":"{\"id\":\"218\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":73,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"218\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[72],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":76,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"212\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[75],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission 
Time":1596020225760,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 6","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"6","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"19","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":12,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":0,"Attempt":0,"Launch Time":1596020225766,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":12,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":0,"Attempt":0,"Launch Time":1596020225766,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225796,"Failed":false,"Killed":false,"Accumulables":[{"ID":963,"Name":"shuffle write time","Update":"543836","Value":"543836","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":962,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":961,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":953,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count 
Failed Values":true,"Metadata":"sql"},{"ID":1010,"Name":"duration","Update":"17","Value":"17","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1011,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1012,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1014,"Name":"time in aggregation build","Update":"11","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1016,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1017,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1019,"Name":"time in aggregation build","Update":"8","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1021,"Name":"duration","Update":"17","Value":"17","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1022,"Name":"number of output rows","Update":"44","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1067,"Name":"internal.metrics.input.recordsRead","Update":44,"Value":44,"Internal":true,"Count Failed Values":true},{"ID":1065,"Name":"internal.metrics.shuffle.write.writeTime","Update":543836,"Value":543836,"Internal":true,"Count Failed Values":true},{"ID":1064,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1063,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":1048,"Name":"internal.metrics.executorCpuTime","Update":23733439,"Value":23733439,"Internal":true,"Count Failed Values":true},{"ID":1047,"Name":"internal.metrics.executorRunTime","Update":23,"Value":23,"Internal":true,"Count Failed Values":true},{"ID":1046,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3714406,"Value":3714406,"Internal":true,"Count Failed Values":true},{"ID":1045,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":3,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3714406,"Executor Run Time":23,"Executor CPU Time":23733439,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":543836,"Shuffle Records 
Written":1},"Input Metrics":{"Bytes Read":0,"Records Read":44},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":12,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":78,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"206\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[77],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":75,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"213\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[74],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":74,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"214\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[73],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":77,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"207\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[76],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":72,"Name":"DataSourceRDD","Scope":"{\"id\":\"218\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":73,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"218\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[72],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":76,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"212\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[75],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020225760,"Completion Time":1596020225797,"Accumulables":[{"ID":1064,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":1010,"Name":"duration","Value":"17","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1046,"Name":"internal.metrics.executorDeserializeCpuTime","Value":3714406,"Internal":true,"Count Failed Values":true},{"ID":1019,"Name":"time in aggregation build","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1067,"Name":"internal.metrics.input.recordsRead","Value":44,"Internal":true,"Count Failed Values":true},{"ID":1022,"Name":"number of output rows","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1049,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":1016,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":962,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":953,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1054,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":1045,"Name":"internal.metrics.executorDeserializeTime","Value":3,"Internal":true,"Count Failed Values":true},{"ID":1063,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true},{"ID":1048,"Name":"internal.metrics.executorCpuTime","Value":23733439,"Internal":true,"Count Failed Values":true},{"ID":1012,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1021,"Name":"duration","Value":"17","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":961,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1065,"Name":"internal.metrics.shuffle.write.writeTime","Value":543836,"Internal":true,"Count Failed Values":true},{"ID":1047,"Name":"internal.metrics.executorRunTime","Value":23,"Internal":true,"Count Failed Values":true},{"ID":1014,"Name":"time in aggregation build","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":963,"Name":"shuffle write time","Value":"543836","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1017,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1011,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} 
+{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":83,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"198\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[82],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":81,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"202\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[80],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":79,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"206\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[78],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":80,"Name":"StateStoreRDD","Scope":"{\"id\":\"205\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[79],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":82,"Name":"StateStoreRDD","Scope":"{\"id\":\"201\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[81],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020225801,"Accumulables":[],"Resource Profile 
Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 6","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"6","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"19","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":13,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":0,"Attempt":0,"Launch Time":1596020225808,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":13,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":1,"Attempt":0,"Launch Time":1596020225809,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":13,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":19,"Index":0,"Attempt":0,"Launch Time":1596020225808,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225868,"Failed":false,"Killed":false,"Accumulables":[{"ID":986,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":988,"Name":"peak 
memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":990,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":996,"Name":"time to update","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":997,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":998,"Name":"time to commit changes","Update":"26","Value":"26","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1000,"Name":"estimated size of state only on current version","Update":"88","Value":"88","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1001,"Name":"count of cache hit on states cache in provider","Update":"12","Value":"12","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":999,"Name":"memory used by state","Update":"400","Value":"400","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1003,"Name":"duration","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1005,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1007,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1087,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1086,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1085,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1084,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1083,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1082,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1081,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1079,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":1074,"Name":"internal.metrics.resultSize","Update":5311,"Value":5311,"Internal":true,"Count Failed Values":true},{"ID":1073,"Name":"internal.metrics.executorCpuTime","Update":17503528,"Value":17503528,"Internal":true,"Count Failed Values":true},{"ID":1072,"Name":"internal.metrics.executorRunTime","Update":50,"Value":50,"Internal":true,"Count Failed Values":true},{"ID":1071,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4255703,"Value":4255703,"Internal":true,"Count Failed Values":true},{"ID":1070,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":4,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4255703,"Executor Run Time":50,"Executor CPU Time":17503528,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":13,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":1,"Attempt":0,"Launch Time":1596020225809,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020225874,"Failed":false,"Killed":false,"Accumulables":[{"ID":986,"Name":"duration","Update":"2","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":991,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":987,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":988,"Name":"peak memory","Update":"4456448","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":990,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":996,"Name":"time to update","Update":"15","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":995,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":997,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":998,"Name":"time to commit changes","Update":"23","Value":"49","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1000,"Name":"estimated size of state only on current version","Update":"368","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":992,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1001,"Name":"count of cache hit on states cache in provider","Update":"12","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":999,"Name":"memory used by state","Update":"784","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":994,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1003,"Name":"duration","Update":"15","Value":"19","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":1004,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1005,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1007,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1009,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":955,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":959,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":958,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":960,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1087,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1086,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1085,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":1084,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1083,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1082,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1081,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1079,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":1074,"Name":"internal.metrics.resultSize","Update":5574,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":1073,"Name":"internal.metrics.executorCpuTime","Update":17516707,"Value":35020235,"Internal":true,"Count Failed Values":true},{"ID":1072,"Name":"internal.metrics.executorRunTime","Update":56,"Value":106,"Internal":true,"Count Failed Values":true},{"ID":1071,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3750230,"Value":8005933,"Internal":true,"Count Failed Values":true},{"ID":1070,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":7,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3750230,"Executor Run Time":56,"Executor CPU Time":17516707,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote 
Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":83,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"198\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[82],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":81,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"202\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[80],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":79,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"206\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[78],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":80,"Name":"StateStoreRDD","Scope":"{\"id\":\"205\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[79],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":82,"Name":"StateStoreRDD","Scope":"{\"id\":\"201\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[81],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020225801,"Completion Time":1596020225874,"Accumulables":[{"ID":1070,"Name":"internal.metrics.executorDeserializeTime","Value":7,"Internal":true,"Count Failed Values":true},{"ID":1079,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":992,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1073,"Name":"internal.metrics.executorCpuTime","Value":35020235,"Internal":true,"Count Failed Values":true},{"ID":1082,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count Failed Values":true},{"ID":1001,"Name":"count of cache hit on states cache in provider","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":995,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1004,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":986,"Name":"duration","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":959,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1085,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed Values":true},{"ID":1007,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":998,"Name":"time to commit changes","Value":"49","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1084,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":997,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1087,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":955,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1081,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":991,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1009,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1072,"Name":"internal.metrics.executorRunTime","Value":106,"Internal":true,"Count Failed Values":true},{"ID":1000,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":994,"Name":"number of total state 
rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1003,"Name":"duration","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":958,"Name":"local bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":988,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1074,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":1083,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":960,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1086,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":987,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1005,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":996,"Name":"time to update","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1071,"Name":"internal.metrics.executorDeserializeCpuTime","Value":8005933,"Internal":true,"Count Failed Values":true},{"ID":999,"Name":"memory used by state","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":990,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":6,"Completion Time":1596020225875,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":20,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 6","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: [value#130, count#131]\nArguments: [value#130, count#131]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#130, count#131]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1095,"metricType":"sum"}]},"time":1596020225891} 
+{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":20,"time":1596020225896} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":19,"time":1596020225897} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":18,"time":1596020225897} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:05.562Z","batchId":6,"batchDuration":351,"durationMs":{"triggerExecution":351,"queryPlanning":28,"getBatch":1,"latestOffset":6,"addBatch":273,"walCommit":25},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1184,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":24,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48837}}","endOffset":"{\"test5\":{\"0\":48881}}","numInputRows":44,"inputRowsPerSecond":100.22779043280183,"processedRowsPerSecond":125.35612535612536}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":21,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 7","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48881}}, {\"test5\":{\"0\":48917}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, 
offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1759]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = c0968891-bf48-4112-a19b-444014085d1d, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = c0968891-bf48-4112-a19b-444014085d1d, opId = 0, ver = 0, numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@6313b68e\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@6313b68e","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 7, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 7, numPartitions = 2], 
2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#1683]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1179,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1178,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1175,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1176,"metricType":"timing"},{"name":"peak memory","accumulatorId":1174,"metricType":"size"},{"name":"number of output rows","accumulatorId":1173,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1177,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1170,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1171,"metricType":"timing"},{"name":"peak memory","accumulatorId":1169,"metricType":"size"},{"name":"number of output rows","accumulatorId":1168,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1172,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1167,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":1119,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":1120,"metricType":"nsTiming"},{"name":"records read","accumulatorId":1117,"metricType":"sum"},{"name":"local bytes read","accumulatorId":1115,"metricType":"size"},{"name":"fetch wait time","accumulatorId":1116,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":1113,"metricType":"size"},{"name":"local blocks read","accumulatorId":1112,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":1111,"metricType":"sum"},{"name":"data 
size","accumulatorId":1110,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":1114,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":1118,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1164,"metricType":"timing"},{"name":"peak memory","accumulatorId":1162,"metricType":"size"},{"name":"number of output rows","accumulatorId":1161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":1150,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":1151,"metricType":"sum"},{"name":"memory used by state","accumulatorId":1156,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":1158,"metricType":"sum"},{"name":"number of output rows","accumulatorId":1149,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":1157,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":1159,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":1155,"metricType":"timing"},{"name":"time to remove","accumulatorId":1154,"metricType":"timing"},{"name":"number of updated state rows","accumulatorId":1152,"metricType":"sum"},{"name":"time to update","accumulatorId":1153,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1146,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1147,"metricType":"timing"},{"name":"peak memory","accumulatorId":1145,"metricType":"size"},{"name":"number of output rows","accumulatorId":1144,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1148,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1143,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020225988} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":22,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 7","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nWriteToDataSourceV2 (14)\n+- * HashAggregate (13)\n +- StateStoreSave (12)\n +- * HashAggregate (11)\n +- StateStoreRestore (10)\n +- Exchange (9)\n +- * HashAggregate (8)\n +- * HashAggregate (7)\n +- * SerializeFromObject (6)\n +- MapPartitions (5)\n +- DeserializeToObject (4)\n +- * Project (3)\n +- * Project (2)\n +- MicroBatchScan (1)\n\n\n(1) MicroBatchScan\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nArguments: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan@7e7b182c, KafkaV2[Subscribe[test5]], {\"test5\":{\"0\":48881}}, {\"test5\":{\"0\":48917}}\n\n(2) Project [codegen id : 1]\nOutput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(3) Project [codegen id : 1]\nOutput [1]: [cast(value#8 as string) AS value#21]\nInput [7]: [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n\n(4) DeserializeToObject\nInput [1]: [value#21]\nArguments: value#21.toString, obj#27: java.lang.String\n\n(5) MapPartitions\nInput [1]: [obj#27]\nArguments: org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String\n\n(6) SerializeFromObject [codegen id : 2]\nInput [1]: [obj#28]\nArguments: [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]\n\n(7) HashAggregate [codegen id : 2]\nInput [1]: [value#29]\nKeys [1]: [value#29]\nFunctions [1]: [partial_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(8) HashAggregate [codegen id : 2]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(9) Exchange\nInput [2]: [value#29, count#38L]\nArguments: hashpartitioning(value#29, 2), true, [id=#1835]\n\n(10) StateStoreRestore\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = e165b23b-1a6f-459f-9c51-288922bb2647, opId = 0, ver = 0, numPartitions = 2], 2\n\n(11) HashAggregate [codegen id : 3]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [merge_count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count#38L]\n\n(12) StateStoreSave\nInput [2]: [value#29, count#38L]\nArguments: [value#29], state info [ checkpoint = , runId = e165b23b-1a6f-459f-9c51-288922bb2647, opId = 0, ver = 0, 
numPartitions = 2], Append, 0, 2\n\n(13) HashAggregate [codegen id : 4]\nInput [2]: [value#29, count#38L]\nKeys [1]: [value#29]\nFunctions [1]: [count(1)]\nAggregate Attributes [1]: [count(1)#31L]\nResults [2]: [value#29, count(1)#31L AS count#32L]\n\n(14) WriteToDataSourceV2\nInput [2]: [value#29, count#32L]\nArguments: org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@6313b68e\n\n","sparkPlanInfo":{"nodeName":"WriteToDataSourceV2","simpleString":"WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@6313b68e","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreSave","simpleString":"StateStoreSave [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 7, numPartitions = 2], Complete, 0, 2","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"StateStoreRestore","simpleString":"StateStoreRestore [value#29], state info [ checkpoint = file:/tmp/temporary-025d7997-5b66-4def-abbf-bdcca57312b9/state, runId = e225d92f-2545-48f8-87a2-9c0309580f8a, opId = 0, ver = 7, numPartitions = 2], 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(value#29, 2), true, [id=#1683]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[merge_count(1)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[value#29], functions=[partial_count(1)])","children":[{"nodeName":"SerializeFromObject","simpleString":"SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#29]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MapPartitions","simpleString":"MapPartitions org.apache.spark.sql.Dataset$$Lambda$1321/872917583@67b99068, obj#28: java.lang.String","children":[{"nodeName":"DeserializeToObject","simpleString":"DeserializeToObject value#21.toString, obj#27: java.lang.String","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [cast(value#8 as string) AS value#21]","children":[{"nodeName":"Project","simpleString":"Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"MicroBatchScan","simpleString":"MicroBatchScan[key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13] class org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan","children":[],"metadata":{},"metrics":[{"name":"number of output 
rows","accumulatorId":1179,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1178,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1175,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1176,"metricType":"timing"},{"name":"peak memory","accumulatorId":1174,"metricType":"size"},{"name":"number of output rows","accumulatorId":1173,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1177,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1170,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1171,"metricType":"timing"},{"name":"peak memory","accumulatorId":1169,"metricType":"size"},{"name":"number of output rows","accumulatorId":1168,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1172,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1167,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":1119,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":1120,"metricType":"nsTiming"},{"name":"records read","accumulatorId":1117,"metricType":"sum"},{"name":"local bytes read","accumulatorId":1115,"metricType":"size"},{"name":"fetch wait time","accumulatorId":1116,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":1113,"metricType":"size"},{"name":"local blocks read","accumulatorId":1112,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":1111,"metricType":"sum"},{"name":"data size","accumulatorId":1110,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":1114,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":1118,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1164,"metricType":"timing"},{"name":"peak memory","accumulatorId":1162,"metricType":"size"},{"name":"number of output rows","accumulatorId":1161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of inputs which are later than watermark ('inputs' are relative to operators)","accumulatorId":1150,"metricType":"sum"},{"name":"number of total state rows","accumulatorId":1151,"metricType":"sum"},{"name":"memory used by state","accumulatorId":1156,"metricType":"size"},{"name":"count of cache hit on states cache in provider","accumulatorId":1158,"metricType":"sum"},{"name":"number of output rows","accumulatorId":1149,"metricType":"sum"},{"name":"estimated size of state only on current version","accumulatorId":1157,"metricType":"size"},{"name":"count of cache miss on states cache in provider","accumulatorId":1159,"metricType":"sum"},{"name":"time to commit changes","accumulatorId":1155,"metricType":"timing"},{"name":"time to remove","accumulatorId":1154,"metricType":"timing"},{"name":"number of updated state 
rows","accumulatorId":1152,"metricType":"sum"},{"name":"time to update","accumulatorId":1153,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1146,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1147,"metricType":"timing"},{"name":"peak memory","accumulatorId":1145,"metricType":"size"},{"name":"number of output rows","accumulatorId":1144,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1148,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1143,"metricType":"timing"}]}],"metadata":{},"metrics":[]},"time":1596020226019} +{"Event":"SparkListenerJobStart","Job ID":7,"Submission Time":1596020226076,"Stage Infos":[{"Stage ID":15,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":95,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"231\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[94],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":93,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"235\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[92],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":91,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"239\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[90],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":94,"Name":"StateStoreRDD","Scope":"{\"id\":\"234\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[93],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":92,"Name":"StateStoreRDD","Scope":"{\"id\":\"238\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[91],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[14],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0},{"Stage ID":14,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":90,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"239\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[89],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":88,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"245\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[87],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":84,"Name":"DataSourceRDD","Scope":"{\"id\":\"251\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":85,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"251\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[84],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":89,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"240\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[88],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":86,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"247\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[85],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":87,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"246\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[86],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Accumulables":[],"Resource Profile Id":0}],"Stage IDs":[15,14],"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 7","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"7","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"22","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerStageSubmitted","Stage 
Info":{"Stage ID":14,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":90,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"239\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[89],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":88,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"245\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[87],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":84,"Name":"DataSourceRDD","Scope":"{\"id\":\"251\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":85,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"251\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[84],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":89,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"240\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[88],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":86,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"247\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[85],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":87,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"246\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[86],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020226077,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 7","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"7","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"22","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":14,"Stage Attempt ID":0,"Task Info":{"Task ID":21,"Index":0,"Attempt":0,"Launch Time":1596020226086,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":14,"Stage 
Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":0,"Attempt":0,"Launch Time":1596020226086,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020226116,"Failed":false,"Killed":false,"Accumulables":[{"ID":1120,"Name":"shuffle write time","Update":"543034","Value":"543034","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1119,"Name":"shuffle records written","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1118,"Name":"shuffle bytes written","Update":"168","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1110,"Name":"data size","Update":"128","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1167,"Name":"duration","Update":"13","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1168,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1169,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1171,"Name":"time in aggregation build","Update":"8","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1173,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1174,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1176,"Name":"time in aggregation build","Update":"6","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1178,"Name":"duration","Update":"13","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1179,"Name":"number of output rows","Update":"36","Value":"36","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1224,"Name":"internal.metrics.input.recordsRead","Update":36,"Value":36,"Internal":true,"Count Failed Values":true},{"ID":1222,"Name":"internal.metrics.shuffle.write.writeTime","Update":543034,"Value":543034,"Internal":true,"Count Failed Values":true},{"ID":1221,"Name":"internal.metrics.shuffle.write.recordsWritten","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1220,"Name":"internal.metrics.shuffle.write.bytesWritten","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":1211,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":524288,"Internal":true,"Count Failed Values":true},{"ID":1206,"Name":"internal.metrics.resultSize","Update":2544,"Value":2544,"Internal":true,"Count Failed Values":true},{"ID":1205,"Name":"internal.metrics.executorCpuTime","Update":19652237,"Value":19652237,"Internal":true,"Count Failed Values":true},{"ID":1204,"Name":"internal.metrics.executorRunTime","Update":19,"Value":19,"Internal":true,"Count Failed Values":true},{"ID":1203,"Name":"internal.metrics.executorDeserializeCpuTime","Update":2829254,"Value":2829254,"Internal":true,"Count Failed Values":true},{"ID":1202,"Name":"internal.metrics.executorDeserializeTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":2,"Executor Deserialize CPU Time":2829254,"Executor Run Time":19,"Executor CPU Time":19652237,"Peak Execution Memory":524288,"Result Size":2544,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":168,"Shuffle Write Time":543034,"Shuffle Records Written":1},"Input Metrics":{"Bytes Read":0,"Records Read":36},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":14,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":1,"RDD Info":[{"RDD ID":90,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"239\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[89],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":88,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"245\",\"name\":\"MapPartitions\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[87],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":84,"Name":"DataSourceRDD","Scope":"{\"id\":\"251\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":85,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"251\",\"name\":\"MicroBatchScan\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[84],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":89,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"240\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[88],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":86,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"247\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[85],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk 
Size":0},{"RDD ID":87,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"246\",\"name\":\"DeserializeToObject\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[86],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020226077,"Completion Time":1596020226117,"Accumulables":[{"ID":1205,"Name":"internal.metrics.executorCpuTime","Value":19652237,"Internal":true,"Count Failed Values":true},{"ID":1178,"Name":"duration","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1169,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1118,"Name":"shuffle bytes written","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1204,"Name":"internal.metrics.executorRunTime","Value":19,"Internal":true,"Count Failed Values":true},{"ID":1222,"Name":"internal.metrics.shuffle.write.writeTime","Value":543034,"Internal":true,"Count Failed Values":true},{"ID":1171,"Name":"time in aggregation build","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1120,"Name":"shuffle write time","Value":"543034","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1174,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1168,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1203,"Name":"internal.metrics.executorDeserializeCpuTime","Value":2829254,"Internal":true,"Count Failed Values":true},{"ID":1167,"Name":"duration","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1221,"Name":"internal.metrics.shuffle.write.recordsWritten","Value":1,"Internal":true,"Count Failed Values":true},{"ID":1176,"Name":"time in aggregation build","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1206,"Name":"internal.metrics.resultSize","Value":2544,"Internal":true,"Count Failed Values":true},{"ID":1224,"Name":"internal.metrics.input.recordsRead","Value":36,"Internal":true,"Count Failed Values":true},{"ID":1179,"Name":"number of output rows","Value":"36","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":1119,"Name":"shuffle records written","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1110,"Name":"data size","Value":"128","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1173,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1202,"Name":"internal.metrics.executorDeserializeTime","Value":2,"Internal":true,"Count Failed Values":true},{"ID":1211,"Name":"internal.metrics.peakExecutionMemory","Value":524288,"Internal":true,"Count Failed Values":true},{"ID":1220,"Name":"internal.metrics.shuffle.write.bytesWritten","Value":168,"Internal":true,"Count Failed Values":true}],"Resource Profile Id":0}} +{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":15,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":95,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"231\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[94],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":93,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"235\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[92],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":91,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"239\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[90],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":94,"Name":"StateStoreRDD","Scope":"{\"id\":\"234\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[93],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":92,"Name":"StateStoreRDD","Scope":"{\"id\":\"238\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[91],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[14],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020226120,"Accumulables":[],"Resource Profile Id":0},"Properties":{"sql.streaming.queryId":"8d268dc2-bc9c-4be8-97a9-b135d2943028","spark.driver.host":"iZbp19vpr16ix621sdw476Z","spark.eventLog.enabled":"true","spark.sql.adaptive.enabled":"false","spark.job.interruptOnCancel":"true","spark.driver.port":"46309","__fetch_continuous_blocks_in_batch_enabled":"true","spark.jars":"file:/root/spark-3.1.0-SNAPSHOT-bin-hadoop2.8/./examples/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar","__is_continuous_processing":"false","spark.app.name":"StructuredKafkaWordCount","callSite.long":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","callSite.short":"start at StructuredKafkaWordCount.scala:86","spark.submit.pyFiles":"","spark.job.description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 7","spark.executor.id":"driver","spark.sql.cbo.enabled":"false","streaming.sql.batchId":"7","spark.jobGroup.id":"e225d92f-2545-48f8-87a2-9c0309580f8a","spark.submit.deployMode":"client","spark.master":"local[*]","spark.eventLog.dir":"/tmp/spark-history","spark.sql.execution.id":"22","spark.app.id":"local-1596020211915","spark.sql.shuffle.partitions":"2"}} +{"Event":"SparkListenerTaskStart","Stage ID":15,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":0,"Attempt":0,"Launch Time":1596020226128,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskStart","Stage ID":15,"Stage 
Attempt ID":0,"Task Info":{"Task ID":23,"Index":1,"Attempt":0,"Launch Time":1596020226129,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":15,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":1,"Attempt":0,"Launch Time":1596020226129,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020226196,"Failed":false,"Killed":false,"Accumulables":[{"ID":1143,"Name":"duration","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1148,"Name":"avg hash probe bucket list iters","Update":"10","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1144,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1145,"Name":"peak memory","Update":"4456448","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1147,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1153,"Name":"time to update","Update":"21","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1152,"Name":"number of updated state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1154,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1155,"Name":"time to commit changes","Update":"19","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1157,"Name":"estimated size of state only on current version","Update":"368","Value":"368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1149,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1158,"Name":"count of cache hit on states cache in provider","Update":"14","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1156,"Name":"memory used by state","Update":"784","Value":"784","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1151,"Name":"number of total state rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1160,"Name":"duration","Update":"21","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1161,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1162,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1164,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1166,"Name":"number of output rows","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1112,"Name":"local blocks read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1116,"Name":"fetch wait time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1115,"Name":"local bytes read","Update":"168","Value":"168","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":1117,"Name":"records read","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1244,"Name":"internal.metrics.shuffle.read.recordsRead","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1243,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1242,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":168,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":1241,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1240,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1239,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1238,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1236,"Name":"internal.metrics.peakExecutionMemory","Update":4718592,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":1231,"Name":"internal.metrics.resultSize","Update":5574,"Value":5574,"Internal":true,"Count Failed Values":true},{"ID":1230,"Name":"internal.metrics.executorCpuTime","Update":19415818,"Value":19415818,"Internal":true,"Count Failed Values":true},{"ID":1229,"Name":"internal.metrics.executorRunTime","Update":60,"Value":60,"Internal":true,"Count Failed Values":true},{"ID":1228,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3845429,"Value":3845429,"Internal":true,"Count Failed Values":true},{"ID":1227,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":3,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3845429,"Executor Run Time":60,"Executor CPU Time":19415818,"Peak Execution Memory":4718592,"Result Size":5574,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":1,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":168,"Total Records Read":1},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerTaskEnd","Stage ID":15,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":0,"Attempt":0,"Launch Time":1596020226128,"Executor ID":"driver","Host":"iZbp19vpr16ix621sdw476Z","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1596020226204,"Failed":false,"Killed":false,"Accumulables":[{"ID":1143,"Name":"duration","Update":"2","Value":"5","Internal":true,"Count Failed 
Values":true,"Metadata":"sql"},{"ID":1145,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1147,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1153,"Name":"time to update","Update":"3","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1154,"Name":"time to remove","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1155,"Name":"time to commit changes","Update":"48","Value":"67","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1157,"Name":"estimated size of state only on current version","Update":"88","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1158,"Name":"count of cache hit on states cache in provider","Update":"14","Value":"28","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1156,"Name":"memory used by state","Update":"400","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1160,"Name":"duration","Update":"3","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1162,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1164,"Name":"time in aggregation build","Update":"0","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1244,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1243,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1242,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":168,"Internal":true,"Count Failed Values":true},{"ID":1241,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1240,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1239,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":1238,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1236,"Name":"internal.metrics.peakExecutionMemory","Update":524288,"Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":1231,"Name":"internal.metrics.resultSize","Update":5311,"Value":10885,"Internal":true,"Count Failed Values":true},{"ID":1230,"Name":"internal.metrics.executorCpuTime","Update":14652861,"Value":34068679,"Internal":true,"Count Failed Values":true},{"ID":1229,"Name":"internal.metrics.executorRunTime","Update":65,"Value":125,"Internal":true,"Count Failed Values":true},{"ID":1228,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3933877,"Value":7779306,"Internal":true,"Count Failed Values":true},{"ID":1227,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":6,"Internal":true,"Count Failed Values":true}]},"Task Executor 
Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3933877,"Executor Run Time":65,"Executor CPU Time":14652861,"Peak Execution Memory":524288,"Result Size":5311,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} +{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":15,"Stage Attempt ID":0,"Stage Name":"start at StructuredKafkaWordCount.scala:86","Number of Tasks":2,"RDD Info":[{"RDD ID":95,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"231\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[94],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":93,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"235\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[92],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":91,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"239\",\"name\":\"Exchange\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[90],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":94,"Name":"StateStoreRDD","Scope":"{\"id\":\"234\",\"name\":\"StateStoreSave\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[93],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":92,"Name":"StateStoreRDD","Scope":"{\"id\":\"238\",\"name\":\"StateStoreRestore\"}","Callsite":"start at StructuredKafkaWordCount.scala:86","Parent IDs":[91],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":2,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent 
IDs":[14],"Details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","Submission Time":1596020226120,"Completion Time":1596020226204,"Accumulables":[{"ID":1115,"Name":"local bytes read","Value":"168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1160,"Name":"duration","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1151,"Name":"number of total state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1145,"Name":"peak memory","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1154,"Name":"time to remove","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1231,"Name":"internal.metrics.resultSize","Value":10885,"Internal":true,"Count Failed Values":true},{"ID":1240,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1153,"Name":"time to update","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1162,"Name":"peak memory","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1144,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1243,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1117,"Name":"records read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1228,"Name":"internal.metrics.executorDeserializeCpuTime","Value":7779306,"Internal":true,"Count Failed Values":true},{"ID":1147,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1156,"Name":"memory used by state","Value":"1184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1236,"Name":"internal.metrics.peakExecutionMemory","Value":5242880,"Internal":true,"Count Failed Values":true},{"ID":1227,"Name":"internal.metrics.executorDeserializeTime","Value":6,"Internal":true,"Count Failed Values":true},{"ID":1158,"Name":"count of cache hit on states cache in provider","Value":"28","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1149,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1239,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":1,"Internal":true,"Count 
Failed Values":true},{"ID":1230,"Name":"internal.metrics.executorCpuTime","Value":34068679,"Internal":true,"Count Failed Values":true},{"ID":1152,"Name":"number of updated state rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1242,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":168,"Internal":true,"Count Failed Values":true},{"ID":1116,"Name":"fetch wait time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1161,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1143,"Name":"duration","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1155,"Name":"time to commit changes","Value":"67","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1164,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1241,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1244,"Name":"internal.metrics.shuffle.read.recordsRead","Value":1,"Internal":true,"Count Failed Values":true},{"ID":1148,"Name":"avg hash probe bucket list iters","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1157,"Name":"estimated size of state only on current version","Value":"456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1166,"Name":"number of output rows","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1238,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1229,"Name":"internal.metrics.executorRunTime","Value":125,"Internal":true,"Count Failed Values":true},{"ID":1112,"Name":"local blocks read","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"}],"Resource Profile Id":0}} +{"Event":"SparkListenerJobEnd","Job ID":7,"Completion Time":1596020226204,"Job Result":{"Result":"JobSucceeded"}} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":23,"description":"\nid = 8d268dc2-bc9c-4be8-97a9-b135d2943028\nrunId = e225d92f-2545-48f8-87a2-9c0309580f8a\nbatch = 7","details":"org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:366)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount$.main(StructuredKafkaWordCount.scala:86)\norg.apache.spark.examples.sql.streaming.StructuredKafkaWordCount.main(StructuredKafkaWordCount.scala)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\norg.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)\norg.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:934)\norg.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)\norg.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)\norg.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)\norg.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1013)\norg.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1022)\norg.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)","physicalPlanDescription":"== Physical Plan ==\nLocalTableScan (1)\n\n\n(1) LocalTableScan\nOutput [2]: 
[value#144, count#145]\nArguments: [value#144, count#145]\n\n","sparkPlanInfo":{"nodeName":"LocalTableScan","simpleString":"LocalTableScan [value#144, count#145]","children":[],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1252,"metricType":"sum"}]},"time":1596020226221} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":23,"time":1596020226230} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":22,"time":1596020226231} +{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":21,"time":1596020226231} +{"Event":"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent","progress":{"id":"8d268dc2-bc9c-4be8-97a9-b135d2943028","runId":"e225d92f-2545-48f8-87a2-9c0309580f8a","name":null,"timestamp":"2020-07-29T10:57:05.916Z","batchId":7,"batchDuration":341,"durationMs":{"triggerExecution":341,"queryPlanning":24,"getBatch":0,"latestOffset":3,"addBatch":271,"walCommit":14},"eventTime":{},"stateOperators":[{"numRowsTotal":1,"numRowsUpdated":1,"memoryUsedBytes":1184,"numLateInputs":0,"customMetrics":{"stateOnCurrentVersionSizeBytes":456,"loadedMapCacheHitCount":28,"loadedMapCacheMissCount":0}}],"sources":[{"description":"KafkaV2[Subscribe[test5]]","startOffset":"{\"test5\":{\"0\":48881}}","endOffset":"{\"test5\":{\"0\":48917}}","numInputRows":36,"inputRowsPerSecond":101.69491525423729,"processedRowsPerSecond":105.57184750733137}],"sink":{"description":"org.apache.spark.sql.execution.streaming.ConsoleTable$@514ba885","numOutputRows":1},"observedMetrics":{}}} +{"Event":"SparkListenerApplicationEnd","Timestamp":1596020226301} diff --git a/sql/core/src/test/scala/org/apache/spark/deploy/history/Utils.scala b/sql/core/src/test/scala/org/apache/spark/deploy/history/Utils.scala new file mode 100644 index 0000000000000..f73305b1b001e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/deploy/history/Utils.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.deploy.history + +import org.apache.spark.SparkConf +import org.apache.spark.internal.config.History.HISTORY_LOG_DIR +import org.apache.spark.util.ManualClock + +object Utils { + def withFsHistoryProvider(logDir: String)(fn: FsHistoryProvider => Unit): Unit = { + var provider: FsHistoryProvider = null + try { + val clock = new ManualClock() + val conf = new SparkConf().set(HISTORY_LOG_DIR, logDir) + val provider = new FsHistoryProvider(conf, clock) + provider.checkForLogs() + fn(provider) + } finally { + if (provider != null) { + provider.stop() + provider = null + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryHistorySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryHistorySuite.scala new file mode 100644 index 0000000000000..160535ea4d048 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryHistorySuite.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming.ui + +import java.util.Locale +import javax.servlet.http.HttpServletRequest + +import org.mockito.Mockito.{mock, when} +import org.scalatest.BeforeAndAfter + +import org.apache.spark.deploy.history.{Utils => HsUtils} +import org.apache.spark.sql.execution.ui.StreamingQueryStatusStore +import org.apache.spark.sql.test.SharedSparkSession + +class StreamingQueryHistorySuite extends SharedSparkSession with BeforeAndAfter { + + test("support streaming query events") { + val logDir = Thread.currentThread().getContextClassLoader.getResource("spark-events").toString + HsUtils.withFsHistoryProvider(logDir) { provider => + val appUi = provider.getAppUI("local-1596020211915", None).getOrElse { + assert(false, "Failed to load event log of local-1596020211915.") + null + } + assert(appUi.ui.appName == "StructuredKafkaWordCount") + assert(appUi.ui.store.store.count(classOf[StreamingQueryData]) == 1) + assert(appUi.ui.store.store.count(classOf[StreamingQueryProgressWrapper]) == 8) + + val store = new StreamingQueryStatusStore(appUi.ui.store.store) + val tab = new StreamingQueryTab(store, appUi.ui) + val request = mock(classOf[HttpServletRequest]) + var html = new StreamingQueryPage(tab).render(request) + .toString().toLowerCase(Locale.ROOT) + // 81.39: Avg Input /sec + assert(html.contains("81.39")) + // 157.05: Avg Process /sec + assert(html.contains("157.05")) + + val id = "8d268dc2-bc9c-4be8-97a9-b135d2943028" + val runId = "e225d92f-2545-48f8-87a2-9c0309580f8a" + when(request.getParameter("id")).thenReturn(runId) + html = new StreamingQueryStatisticsPage(tab).render(request) + .toString().toLowerCase(Locale.ROOT) + assert(html.contains("8 completed batches")) + assert(html.contains(id)) + assert(html.contains(runId)) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala index c2b6688faf0e7..246fa1f7c9184 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala @@ -20,11 +20,13 @@ package org.apache.spark.sql.streaming.ui import java.util.{Locale, UUID} import javax.servlet.http.HttpServletRequest +import scala.xml.Node + import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} import org.scalatest.BeforeAndAfter -import scala.xml.Node import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.ui.StreamingQueryStatusStore import org.apache.spark.sql.streaming.StreamingQueryProgress import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.ui.SparkUI @@ -35,26 +37,26 @@ class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { val id = UUID.randomUUID() val request = mock(classOf[HttpServletRequest]) val tab = mock(classOf[StreamingQueryTab], RETURNS_SMART_NULLS) - val statusListener = mock(classOf[StreamingQueryStatusListener], RETURNS_SMART_NULLS) + val store = mock(classOf[StreamingQueryStatusStore], RETURNS_SMART_NULLS) when(tab.appName).thenReturn("testing") when(tab.headerTabs).thenReturn(Seq.empty) - when(tab.statusListener).thenReturn(statusListener) + when(tab.store).thenReturn(store) val streamQuery = createStreamQueryUIData(id) - when(statusListener.allQueryStatus).thenReturn(Seq(streamQuery)) + when(store.allQueryUIData).thenReturn(Seq(streamQuery)) var html = renderStreamingQueryPage(request, tab) 
.toString().toLowerCase(Locale.ROOT) assert(html.contains("active streaming queries (1)")) - when(streamQuery.isActive).thenReturn(false) - when(streamQuery.exception).thenReturn(None) + when(streamQuery.summary.isActive).thenReturn(false) + when(streamQuery.summary.exception).thenReturn(None) html = renderStreamingQueryPage(request, tab) .toString().toLowerCase(Locale.ROOT) assert(html.contains("completed streaming queries (1)")) assert(html.contains("finished")) - when(streamQuery.isActive).thenReturn(false) - when(streamQuery.exception).thenReturn(Option("exception in query")) + when(streamQuery.summary.isActive).thenReturn(false) + when(streamQuery.summary.exception).thenReturn(Option("exception in query")) html = renderStreamingQueryPage(request, tab) .toString().toLowerCase(Locale.ROOT) assert(html.contains("completed streaming queries (1)")) @@ -66,17 +68,20 @@ class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { val id = UUID.randomUUID() val request = mock(classOf[HttpServletRequest]) val tab = mock(classOf[StreamingQueryTab], RETURNS_SMART_NULLS) - val statusListener = mock(classOf[StreamingQueryStatusListener], RETURNS_SMART_NULLS) + val store = mock(classOf[StreamingQueryStatusStore], RETURNS_SMART_NULLS) + when(request.getParameter("id")).thenReturn(id.toString) + when(tab.appName).thenReturn("testing") + when(tab.headerTabs).thenReturn(Seq.empty) + when(tab.store).thenReturn(store) val ui = mock(classOf[SparkUI]) when(request.getParameter("id")).thenReturn(id.toString) when(tab.appName).thenReturn("testing") when(tab.headerTabs).thenReturn(Seq.empty) - when(tab.statusListener).thenReturn(statusListener) when(ui.conf).thenReturn(new SparkConf()) when(tab.parent).thenReturn(ui) val streamQuery = createStreamQueryUIData(id) - when(statusListener.allQueryStatus).thenReturn(Seq(streamQuery)) + when(store.allQueryUIData).thenReturn(Seq(streamQuery)) val html = renderStreamingQueryStatisticsPage(request, tab) .toString().toLowerCase(Locale.ROOT) @@ -94,15 +99,18 @@ class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { when(progress.batchId).thenReturn(2) when(progress.prettyJson).thenReturn("""{"a":1}""") + val summary = mock(classOf[StreamingQueryData], RETURNS_SMART_NULLS) + when(summary.isActive).thenReturn(true) + when(summary.name).thenReturn("query") + when(summary.id).thenReturn(id) + when(summary.runId).thenReturn(id) + when(summary.startTimestamp).thenReturn(1L) + when(summary.exception).thenReturn(None) + val streamQuery = mock(classOf[StreamingQueryUIData], RETURNS_SMART_NULLS) - when(streamQuery.isActive).thenReturn(true) - when(streamQuery.name).thenReturn("query") - when(streamQuery.id).thenReturn(id) - when(streamQuery.runId).thenReturn(id) - when(streamQuery.startTimestamp).thenReturn(1L) + when(streamQuery.summary).thenReturn(summary) when(streamQuery.lastProgress).thenReturn(progress) when(streamQuery.recentProgress).thenReturn(Array(progress)) - when(streamQuery.exception).thenReturn(None) streamQuery } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala index 6aa440e5609c5..91c55d5598a6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala @@ -17,19 +17,28 @@ package 
org.apache.spark.sql.streaming.ui -import java.util.UUID +import java.text.SimpleDateFormat +import java.util.{Date, UUID} import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} +import org.scalatest.time.SpanSugar._ +import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone +import org.apache.spark.sql.execution.ui.StreamingQueryStatusStore +import org.apache.spark.sql.internal.StaticSQLConf import org.apache.spark.sql.streaming.{StreamingQueryListener, StreamingQueryProgress, StreamTest} import org.apache.spark.sql.streaming +import org.apache.spark.status.ElementTrackingStore +import org.apache.spark.util.kvstore.InMemoryStore class StreamingQueryStatusListenerSuite extends StreamTest { test("onQueryStarted, onQueryProgress, onQueryTerminated") { - val listener = new StreamingQueryStatusListener(spark.sparkContext.conf) + val kvStore = new ElementTrackingStore(new InMemoryStore(), sparkConf) + val listener = new StreamingQueryStatusListener(spark.sparkContext.conf, kvStore) + val queryStore = new StreamingQueryStatusStore(kvStore) - // hanlde query started event + // handle query started event val id = UUID.randomUUID() val runId = UUID.randomUUID() val startEvent = new StreamingQueryListener.QueryStartedEvent( @@ -37,8 +46,9 @@ class StreamingQueryStatusListenerSuite extends StreamTest { listener.onQueryStarted(startEvent) // result checking - assert(listener.activeQueryStatus.size() == 1) - assert(listener.activeQueryStatus.get(runId).name == "test") + assert(queryStore.allQueryUIData.count(_.summary.isActive) == 1) + assert(queryStore.allQueryUIData.filter(_.summary.isActive).exists(uiData => + uiData.summary.runId == runId && uiData.summary.name.equals("test"))) // handle query progress event val progress = mock(classOf[StreamingQueryProgress], RETURNS_SMART_NULLS) @@ -53,28 +63,32 @@ class StreamingQueryStatusListenerSuite extends StreamTest { listener.onQueryProgress(processEvent) // result checking - val activeQuery = listener.activeQueryStatus.get(runId) - assert(activeQuery.isActive) - assert(activeQuery.recentProgress.length == 1) - assert(activeQuery.lastProgress.id == id) - assert(activeQuery.lastProgress.runId == runId) - assert(activeQuery.lastProgress.timestamp == "2001-10-01T01:00:00.100Z") - assert(activeQuery.lastProgress.inputRowsPerSecond == 10.0) - assert(activeQuery.lastProgress.processedRowsPerSecond == 12.0) - assert(activeQuery.lastProgress.batchId == 2) - assert(activeQuery.lastProgress.prettyJson == """{"a":1}""") + val activeQuery = + queryStore.allQueryUIData.filter(_.summary.isActive).find(_.summary.runId == runId) + assert(activeQuery.isDefined) + assert(activeQuery.get.summary.isActive) + assert(activeQuery.get.recentProgress.length == 1) + assert(activeQuery.get.lastProgress.id == id) + assert(activeQuery.get.lastProgress.runId == runId) + assert(activeQuery.get.lastProgress.timestamp == "2001-10-01T01:00:00.100Z") + assert(activeQuery.get.lastProgress.inputRowsPerSecond == 10.0) + assert(activeQuery.get.lastProgress.processedRowsPerSecond == 12.0) + assert(activeQuery.get.lastProgress.batchId == 2) + assert(activeQuery.get.lastProgress.prettyJson == """{"a":1}""") // handle terminate event val terminateEvent = new StreamingQueryListener.QueryTerminatedEvent(id, runId, None) listener.onQueryTerminated(terminateEvent) - assert(!listener.inactiveQueryStatus.head.isActive) - assert(listener.inactiveQueryStatus.head.runId == runId) - assert(listener.inactiveQueryStatus.head.id == id) + 
assert(!queryStore.allQueryUIData.filterNot(_.summary.isActive).head.summary.isActive) + assert(queryStore.allQueryUIData.filterNot(_.summary.isActive).head.summary.runId == runId) + assert(queryStore.allQueryUIData.filterNot(_.summary.isActive).head.summary.id == id) } test("same query start multiple times") { - val listener = new StreamingQueryStatusListener(spark.sparkContext.conf) + val kvStore = new ElementTrackingStore(new InMemoryStore(), sparkConf) + val listener = new StreamingQueryStatusListener(spark.sparkContext.conf, kvStore) + val queryStore = new StreamingQueryStatusStore(kvStore) // handle first time start val id = UUID.randomUUID() @@ -94,11 +108,106 @@ class StreamingQueryStatusListenerSuite extends StreamTest { listener.onQueryStarted(startEvent1) // result checking - assert(listener.activeQueryStatus.size() == 1) - assert(listener.inactiveQueryStatus.length == 1) - assert(listener.activeQueryStatus.containsKey(runId1)) - assert(listener.activeQueryStatus.get(runId1).id == id) - assert(listener.inactiveQueryStatus.head.runId == runId0) - assert(listener.inactiveQueryStatus.head.id == id) + assert(queryStore.allQueryUIData.count(_.summary.isActive) == 1) + assert(queryStore.allQueryUIData.filterNot(_.summary.isActive).length == 1) + assert(queryStore.allQueryUIData.filter(_.summary.isActive).exists(_.summary.runId == runId1)) + assert(queryStore.allQueryUIData.filter(_.summary.isActive).exists(uiData => + uiData.summary.runId == runId1 && uiData.summary.id == id)) + assert(queryStore.allQueryUIData.filterNot(_.summary.isActive).head.summary.runId == runId0) + assert(queryStore.allQueryUIData.filterNot(_.summary.isActive).head.summary.id == id) + } + + test("test small retained queries") { + val kvStore = new ElementTrackingStore(new InMemoryStore(), sparkConf) + val conf = spark.sparkContext.conf + conf.set(StaticSQLConf.STREAMING_UI_RETAINED_QUERIES.key, "2") + val listener = new StreamingQueryStatusListener(conf, kvStore) + val queryStore = new StreamingQueryStatusStore(kvStore) + + def addNewQuery(): (UUID, UUID) = { + val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 + format.setTimeZone(getTimeZone("UTC")) + val id = UUID.randomUUID() + val runId = UUID.randomUUID() + val startEvent = new StreamingQueryListener.QueryStartedEvent( + id, runId, "test1", format.format(new Date(System.currentTimeMillis()))) + listener.onQueryStarted(startEvent) + (id, runId) + } + + def checkInactiveQueryStatus(numInactives: Int, targetInactives: Seq[UUID]): Unit = { + eventually(timeout(10.seconds)) { + val inactiveQueries = queryStore.allQueryUIData.filter(!_.summary.isActive) + assert(inactiveQueries.size == numInactives) + assert(inactiveQueries.map(_.summary.id).toSet == targetInactives.toSet) + } + } + + val (id1, runId1) = addNewQuery() + val (id2, runId2) = addNewQuery() + val (id3, runId3) = addNewQuery() + assert(queryStore.allQueryUIData.count(!_.summary.isActive) == 0) + + val terminateEvent1 = new StreamingQueryListener.QueryTerminatedEvent(id1, runId1, None) + listener.onQueryTerminated(terminateEvent1) + checkInactiveQueryStatus(1, Seq(id1)) + val terminateEvent2 = new StreamingQueryListener.QueryTerminatedEvent(id2, runId2, None) + listener.onQueryTerminated(terminateEvent2) + checkInactiveQueryStatus(2, Seq(id1, id2)) + val terminateEvent3 = new StreamingQueryListener.QueryTerminatedEvent(id3, runId3, None) + listener.onQueryTerminated(terminateEvent3) + checkInactiveQueryStatus(2, Seq(id2, id3)) + } + + test("test small retained progress") { + 
val kvStore = new ElementTrackingStore(new InMemoryStore(), sparkConf) + val conf = spark.sparkContext.conf + conf.set(StaticSQLConf.STREAMING_UI_RETAINED_PROGRESS_UPDATES.key, "5") + val listener = new StreamingQueryStatusListener(conf, kvStore) + val queryStore = new StreamingQueryStatusStore(kvStore) + + val id = UUID.randomUUID() + val runId = UUID.randomUUID() + val startEvent = new StreamingQueryListener.QueryStartedEvent( + id, runId, "test", "2016-12-05T20:54:20.827Z") + listener.onQueryStarted(startEvent) + + var batchId: Int = 0 + + def addQueryProgress(): Unit = { + val progress = mockProgressData(id, runId) + val processEvent = new streaming.StreamingQueryListener.QueryProgressEvent(progress) + listener.onQueryProgress(processEvent) + } + + def mockProgressData(id: UUID, runId: UUID): StreamingQueryProgress = { + val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 + format.setTimeZone(getTimeZone("UTC")) + + val progress = mock(classOf[StreamingQueryProgress], RETURNS_SMART_NULLS) + when(progress.id).thenReturn(id) + when(progress.runId).thenReturn(runId) + when(progress.timestamp).thenReturn(format.format(new Date(System.currentTimeMillis()))) + when(progress.inputRowsPerSecond).thenReturn(10.0) + when(progress.processedRowsPerSecond).thenReturn(12.0) + when(progress.batchId).thenReturn(batchId) + when(progress.prettyJson).thenReturn("""{"a":1}""") + + batchId += 1 + progress + } + + def checkQueryProcessData(targetNum: Int): Unit = { + eventually(timeout(10.seconds)) { + assert(queryStore.getQueryProgressData(runId).size == targetNum) + } + } + + Array.tabulate(4) { _ => addQueryProgress() } + checkQueryProcessData(4) + addQueryProgress() + checkQueryProcessData(5) + addQueryProgress() + checkQueryProcessData(5) } } From 90d4d7d43ffd29ad780dc7c5588b7e55a73aba97 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 3 Dec 2020 09:31:46 +0800 Subject: [PATCH 085/150] [SPARK-33610][ML] Imputer transform skip duplicate head() job ### What changes were proposed in this pull request? on each call of `transform`, a head() job will be triggered, which can be skipped by using a lazy var. ### Why are the changes needed? avoiding duplicate head() jobs ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? existing tests Closes #30550 from zhengruifeng/imputer_transform. 
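For illustration, a minimal sketch of the caching pattern this change applies (this is not the Imputer code itself; the wrapper class and method names below are hypothetical): the result of the eager `head()` action is memoized in a transient lazy val so that repeated `transform` calls reuse it instead of launching a new job.

```scala
import org.apache.spark.sql.DataFrame

// Illustrative sketch only: surrogateDF.head() triggers a Spark job, so its
// result is memoized in a lazy val and computed at most once per instance.
// `CachedSurrogates` and `surrogateFor` are hypothetical names for this sketch.
class CachedSurrogates(surrogateDF: DataFrame) extends Serializable {
  @transient private lazy val surrogates: Map[String, Double] = {
    val row = surrogateDF.head()  // single head() job, run on first access only
    row.schema.fieldNames.zipWithIndex
      .map { case (name, index) => (name, row.getDouble(index)) }
      .toMap
  }

  def surrogateFor(column: String): Double = surrogates(column)
}
```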
Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- .../org/apache/spark/ml/feature/Imputer.scala | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 03ebe0299f63f..d0b6ab1ef2cbc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -254,20 +254,25 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCols(value: Array[String]): this.type = set(outputCols, value) + @transient private lazy val surrogates = { + val row = surrogateDF.head() + row.schema.fieldNames.zipWithIndex + .map { case (name, index) => (name, row.getDouble(index)) } + .toMap + } + override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) - val (inputColumns, outputColumns) = getInOutCols - val surrogates = surrogateDF.select(inputColumns.map(col): _*).head().toSeq - - - val newCols = inputColumns.zip(outputColumns).zip(surrogates).map { - case ((inputCol, outputCol), surrogate) => - val inputType = dataset.schema(inputCol).dataType - val ic = col(inputCol).cast(DoubleType) - when(ic.isNull, surrogate) - .when(ic === $(missingValue), surrogate) - .otherwise(ic) - .cast(inputType) + val (inputColumns, outputColumns) = getInOutCols() + + val newCols = inputColumns.map { inputCol => + val surrogate = surrogates(inputCol) + val inputType = dataset.schema(inputCol).dataType + val ic = col(inputCol).cast(DoubleType) + when(ic.isNull, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType) } dataset.withColumns(outputColumns, newCols).toDF() } From 878cc0e6e95f300a0a58c742654f53a28b30b174 Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Wed, 2 Dec 2020 17:36:25 -0800 Subject: [PATCH 086/150] [SPARK-32896][SS][FOLLOW-UP] Rename the API to `toTable` ### What changes were proposed in this pull request? As the discussion in https://github.com/apache/spark/pull/30521#discussion_r531463427, rename the API to `toTable`. ### Why are the changes needed? Rename the API for further extension and accuracy. ### Does this PR introduce _any_ user-facing change? Yes, it's an API change but the new API is not released yet. ### How was this patch tested? Existing UT. Closes #30571 from xuanyuanking/SPARK-32896-follow. 
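For reference, a minimal usage sketch of the renamed API, mirroring the test change in this commit (the checkpoint path and table name below are illustrative, and `inputDF` is assumed to be a streaming DataFrame):

```scala
// Illustrative only: writes a streaming Dataset to a catalog table via the
// renamed DataStreamWriter.toTable API (formerly saveAsTable).
val query = inputDF
  .writeStream
  .option("checkpointLocation", "/tmp/checkpoints/word_count") // hypothetical path
  .toTable("word_count")                                       // hypothetical table name
```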
Authored-by: Yuanjian Li Signed-off-by: Shixiong Zhu --- .../scala/org/apache/spark/sql/streaming/DataStreamWriter.scala | 2 +- .../spark/sql/streaming/test/DataStreamTableAPISuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index d67e175c24dd9..9e3599712fde5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -304,7 +304,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { * @since 3.1.0 */ @throws[TimeoutException] - def saveAsTable(tableName: String): StreamingQuery = { + def toTable(tableName: String): StreamingQuery = { this.source = SOURCE_NAME_TABLE this.tableName = tableName startInternal(None) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala index 062b1060bc601..bf850432d5c0e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala @@ -291,7 +291,7 @@ class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { val query = inputDF .writeStream .option("checkpointLocation", checkpointDir.getAbsolutePath) - .saveAsTable(tableIdentifier) + .toTable(tableIdentifier) inputData.addData(newInputs: _*) From 08809897554a48065c2280c709d7efba28fa441d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 3 Dec 2020 10:57:14 +0900 Subject: [PATCH 087/150] [SPARK-22798][PYTHON][ML][FOLLOWUP] Add labelsArray to PySpark StringIndexer ### What changes were proposed in this pull request? This is a followup to add missing `labelsArray` to PySpark `StringIndexer`. ### Why are the changes needed? `labelsArray` is for multi-column case for `StringIndexer`. We should provide this accessor at PySpark side too. ### Does this PR introduce _any_ user-facing change? Yes, `labelsArray` was missing in PySpark `StringIndexer` in Spark 3.0. ### How was this patch tested? Unit test. Closes #30579 from viirya/SPARK-22798-followup. Authored-by: Liang-Chi Hsieh Signed-off-by: HyukjinKwon --- python/pyspark/ml/feature.py | 12 ++++++++++++ python/pyspark/ml/tests/test_feature.py | 1 + 2 files changed, 13 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8138f34d7a19e..7cfeabea4aa97 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3852,9 +3852,21 @@ def from_arrays_of_labels(cls, arrayOfLabels, inputCols, outputCols=None, def labels(self): """ Ordered list of labels, corresponding to indices to be assigned. + + .. deprecated:: 3.1.0 + It will be removed in future versions. Use `labelsArray` method instead. """ return self._call_java("labels") + @property + @since("3.1.0") + def labelsArray(self): + """ + Array of ordered list of labels, corresponding to indices to be assigned + for each input column. 
+ """ + return self._call_java("labelsArray") + @inherit_doc class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 98b8ce6dfb95c..2cceb04338806 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -232,6 +232,7 @@ def test_string_indexer_from_labels(self): model = StringIndexerModel.from_labels(["a", "b", "c"], inputCol="label", outputCol="indexed", handleInvalid="keep") self.assertEqual(model.labels, ["a", "b", "c"]) + self.assertEqual(model.labelsArray, [("a", "b", "c")]) df1 = self.spark.createDataFrame([ (0, "a"), From 3b2ff16ee6e457daade0ecb9f96955c8ed73f2a5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 3 Dec 2020 14:34:44 +0900 Subject: [PATCH 088/150] [SPARK-33636][PYTHON][ML][FOLLOWUP] Update since tag of labelsArray in StringIndexer ### What changes were proposed in this pull request? This is to update `labelsArray`'s since tag. ### Why are the changes needed? The original change was backported to branch-3.0 for 3.0.2 version. So it is better to update the since tag to reflect the fact. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? N/A. Just tag change. Closes #30582 from viirya/SPARK-33636-followup. Authored-by: Liang-Chi Hsieh Signed-off-by: HyukjinKwon --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 7cfeabea4aa97..546c46383d340 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3859,7 +3859,7 @@ def labels(self): return self._call_java("labels") @property - @since("3.1.0") + @since("3.0.2") def labelsArray(self): """ Array of ordered list of labels, corresponding to indices to be assigned From ff13f574e67ff9e2c38167368dc6190455e8ed7f Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Thu, 3 Dec 2020 14:04:08 +0000 Subject: [PATCH 089/150] [SPARK-20044][SQL] Add new function DATE_FROM_UNIX_DATE and UNIX_DATE ### What changes were proposed in this pull request? Add new functions DATE_FROM_UNIX_DATE and UNIX_DATE for conversion between Date type and Numeric types. ### Why are the changes needed? 1. Explicit conversion between Date type and Numeric types is disallowed in ANSI mode. We need to provide new functions for users to complete the conversion. 2. We have introduced new functions from Bigquery for conversion between Timestamp type and Numeric types: TIMESTAMP_SECONDS, TIMESTAMP_MILLIS, TIMESTAMP_MICROS , UNIX_SECONDS, UNIX_MILLIS, and UNIX_MICROS. It makes sense to add functions for conversion between Date type and Numeric types as well. ### Does this PR introduce _any_ user-facing change? Yes, two new datetime functions are added. ### How was this patch tested? Unit tests Closes #30588 from gengliangwang/dateToNumber. 
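A short sketch of the two new functions invoked from the Scala API; the expected results match the expression descriptions added below.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("unixDateExample").getOrCreate()

spark.sql("SELECT DATE_FROM_UNIX_DATE(1)").show()       // 1970-01-02
spark.sql("SELECT UNIX_DATE(DATE '1970-01-02')").show() // 1
```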
Authored-by: Gengliang Wang Signed-off-by: Wenchen Fan --- .../catalyst/analysis/FunctionRegistry.scala | 2 + .../expressions/datetimeExpressions.scala | 46 +++++++++++++++++++ .../expressions/DateExpressionsSuite.scala | 24 ++++++++++ .../sql-functions/sql-expression-schema.md | 4 +- .../resources/sql-tests/inputs/datetime.sql | 5 +- .../sql-tests/results/ansi/datetime.sql.out | 18 +++++++- .../sql-tests/results/datetime-legacy.sql.out | 18 +++++++- .../sql-tests/results/datetime.sql.out | 18 +++++++- 8 files changed, 130 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 5c2816a0baa95..3b46de539ce3d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -427,6 +427,8 @@ object FunctionRegistry { expression[MakeInterval]("make_interval"), expression[DatePart]("date_part"), expression[Extract]("extract"), + expression[DateFromUnixDate]("date_from_unix_date"), + expression[UnixDate]("unix_date"), expression[SecondsToTimestamp]("timestamp_seconds"), expression[MillisToTimestamp]("timestamp_millis"), expression[MicrosToTimestamp]("timestamp_micros"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 60dc32c1571fe..c20dd6148be3e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -400,6 +400,52 @@ case class DayOfYear(child: Expression) extends GetDateField { override val funcName = "getDayInYear" } +@ExpressionDescription( + usage = "_FUNC_(days) - Create date from the number of days since 1970-01-01.", + examples = """ + Examples: + > SELECT _FUNC_(1); + 1970-01-02 + """, + group = "datetime_funcs", + since = "3.1.0") +case class DateFromUnixDate(child: Expression) extends UnaryExpression + with ImplicitCastInputTypes with NullIntolerant { + override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType) + + override def dataType: DataType = DateType + + override def nullSafeEval(input: Any): Any = input.asInstanceOf[Int] + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + defineCodeGen(ctx, ev, c => c) + + override def prettyName: String = "date_from_unix_date" +} + +@ExpressionDescription( + usage = "_FUNC_(date) - Returns the number of days since 1970-01-01.", + examples = """ + Examples: + > SELECT _FUNC_(DATE("1970-01-02")); + 1 + """, + group = "datetime_funcs", + since = "3.1.0") +case class UnixDate(child: Expression) extends UnaryExpression + with ExpectsInputTypes with NullIntolerant { + override def inputTypes: Seq[AbstractDataType] = Seq(DateType) + + override def dataType: DataType = IntegerType + + override def nullSafeEval(input: Any): Any = input.asInstanceOf[Int] + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + defineCodeGen(ctx, ev, c => c) + + override def prettyName: String = "unix_date" +} + abstract class IntegralToTimestampBase extends UnaryExpression with ExpectsInputTypes with NullIntolerant { diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 8a1a34276341d..79770505ec35d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -1245,6 +1245,30 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkResult(Int.MinValue.toLong - 100) } + test("DATE_FROM_UNIX_DATE") { + def testIntegralFunc(value: Number): Unit = { + checkEvaluation( + DateFromUnixDate(Literal(value.intValue())), + LocalDate.ofEpochDay(value.intValue())) + } + // test null input + checkEvaluation(DateFromUnixDate(Literal(null, IntegerType)), null) + // test integral input + testIntegralInput(testIntegralFunc) + } + + test("UNIX_DATE") { + def testIntegralFunc(value: Number): Unit = { + checkEvaluation( + UnixDate(Literal(LocalDate.ofEpochDay(value.intValue()))), + value.intValue()) + } + // test null input + checkEvaluation(UnixDate(Literal(null, DateType)), null) + // test various inputs + testIntegralInput(testIntegralFunc) + } + test("UNIX_SECONDS") { checkEvaluation(UnixSeconds(Literal(null, TimestampType)), null) var timestamp = Literal(new Timestamp(0L)) diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 861062a1f7705..a6d041a588a6d 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ ## Summary - - Number of queries: 345 + - Number of queries: 347 - Number of expressions that missing example: 13 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint,window ## Schema of Built-in Functions @@ -91,6 +91,7 @@ | org.apache.spark.sql.catalyst.expressions.DateAdd | date_add | SELECT date_add('2016-07-30', 1) | struct | | org.apache.spark.sql.catalyst.expressions.DateDiff | datediff | SELECT datediff('2009-07-31', '2009-07-30') | struct | | org.apache.spark.sql.catalyst.expressions.DateFormatClass | date_format | SELECT date_format('2016-04-08', 'y') | struct | +| org.apache.spark.sql.catalyst.expressions.DateFromUnixDate | date_from_unix_date | SELECT date_from_unix_date(1) | struct | | org.apache.spark.sql.catalyst.expressions.DatePart | date_part | SELECT date_part('YEAR', TIMESTAMP '2019-08-12 01:00:00.123456') | struct | | org.apache.spark.sql.catalyst.expressions.DateSub | date_sub | SELECT date_sub('2016-07-30', 1) | struct | | org.apache.spark.sql.catalyst.expressions.DayOfMonth | day | SELECT day('2009-07-30') | struct | @@ -289,6 +290,7 @@ | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct | | org.apache.spark.sql.catalyst.expressions.UnaryPositive | positive | SELECT positive(1) | struct<(+ 1):int> | | org.apache.spark.sql.catalyst.expressions.Unhex | unhex | SELECT decode(unhex('537061726B2053514C'), 'UTF-8') | struct | +| org.apache.spark.sql.catalyst.expressions.UnixDate | unix_date | SELECT unix_date(DATE("1970-01-02")) | struct | | org.apache.spark.sql.catalyst.expressions.UnixMicros | unix_micros | SELECT unix_micros(TIMESTAMP('1970-01-01 00:00:01Z')) | struct | | org.apache.spark.sql.catalyst.expressions.UnixMillis | unix_millis | SELECT 
unix_millis(TIMESTAMP('1970-01-01 00:00:01Z')) | struct | | org.apache.spark.sql.catalyst.expressions.UnixSeconds | unix_seconds | SELECT unix_seconds(TIMESTAMP('1970-01-01 00:00:01Z')) | struct | diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index c2ccb3ee0db06..e35266a85d46b 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -18,7 +18,10 @@ select TIMESTAMP_SECONDS(0.1234567d), TIMESTAMP_SECONDS(FLOAT(0.1234567)); select UNIX_SECONDS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_SECONDS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_SECONDS(null); select UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MILLIS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MILLIS(null); select UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08Z')), UNIX_MICROS(TIMESTAMP('2020-12-01 14:30:08.999999Z')), UNIX_MICROS(null); - +-- DATE_FROM_UNIX_DATE +select DATE_FROM_UNIX_DATE(0), DATE_FROM_UNIX_DATE(1000), DATE_FROM_UNIX_DATE(null); +-- UNIX_DATE +select UNIX_DATE(DATE('1970-01-01')), UNIX_DATE(DATE('2020-12-04')), UNIX_DATE(null); -- [SPARK-16836] current_date and current_timestamp literals select current_date = current_date(), current_timestamp = current_timestamp(); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out index 9d99d3b870b3f..18a751f573bc2 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 120 +-- Number of queries: 122 -- !query @@ -111,6 +111,22 @@ struct +-- !query output +1970-01-01 1972-09-27 NULL + + +-- !query +select UNIX_DATE(DATE('1970-01-01')), UNIX_DATE(DATE('2020-12-04')), UNIX_DATE(null) +-- !query schema +struct +-- !query output +0 18600 NULL + + -- !query select current_date = current_date(), current_timestamp = current_timestamp() -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out index 73e9823d96a73..be75f6fb994dd 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 120 +-- Number of queries: 122 -- !query @@ -111,6 +111,22 @@ struct +-- !query output +1970-01-01 1972-09-27 NULL + + +-- !query +select UNIX_DATE(DATE('1970-01-01')), UNIX_DATE(DATE('2020-12-04')), UNIX_DATE(null) +-- !query schema +struct +-- !query output +0 18600 NULL + + -- !query select current_date = current_date(), current_timestamp = current_timestamp() -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 2c39c1291aa70..1e963ed16fd96 100755 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 120 +-- Number of queries: 122 -- !query @@ -111,6 +111,22 @@ struct +-- !query output +1970-01-01 1972-09-27 NULL + + +-- !query +select UNIX_DATE(DATE('1970-01-01')), 
UNIX_DATE(DATE('2020-12-04')), UNIX_DATE(null) +-- !query schema +struct +-- !query output +0 18600 NULL + + -- !query select current_date = current_date(), current_timestamp = current_timestamp() -- !query schema From 512fb32b38e4694abd9f667581cdd5e99dee811f Mon Sep 17 00:00:00 2001 From: luluorta Date: Thu, 3 Dec 2020 14:58:56 +0000 Subject: [PATCH 090/150] [SPARK-26218][SQL][FOLLOW UP] Fix the corner case of codegen when casting float to Integer ### What changes were proposed in this pull request? This is a followup of [#27151](https://github.com/apache/spark/pull/27151). It fixes the same issue for the codegen path. ### Why are the changes needed? Result corrupt. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added Unit test. Closes #30585 from luluorta/SPARK-26218. Authored-by: luluorta Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/expressions/Cast.scala | 52 +++++++------------ .../sql/catalyst/expressions/CastSuite.scala | 5 ++ 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 95f09d64c484b..1b2e2db932970 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -1393,25 +1393,19 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit """ } - private[this] def lowerAndUpperBound( - fractionType: String, - integralType: String): (String, String) = { - assert(fractionType == "float" || fractionType == "double") - val typeIndicator = fractionType.charAt(0) - val (min, max) = integralType.toLowerCase(Locale.ROOT) match { - case "long" => (Long.MinValue, Long.MaxValue) - case "int" => (Int.MinValue, Int.MaxValue) - case "short" => (Short.MinValue, Short.MaxValue) - case "byte" => (Byte.MinValue, Byte.MaxValue) + private[this] def lowerAndUpperBound(integralType: String): (String, String) = { + val (min, max, typeIndicator) = integralType.toLowerCase(Locale.ROOT) match { + case "long" => (Long.MinValue, Long.MaxValue, "L") + case "int" => (Int.MinValue, Int.MaxValue, "") + case "short" => (Short.MinValue, Short.MaxValue, "") + case "byte" => (Byte.MinValue, Byte.MaxValue, "") } (min.toString + typeIndicator, max.toString + typeIndicator) } - private[this] def castFractionToIntegralTypeCode( - fractionType: String, - integralType: String): CastFunction = { + private[this] def castFractionToIntegralTypeCode(integralType: String): CastFunction = { assert(ansiEnabled) - val (min, max) = lowerAndUpperBound(fractionType, integralType) + val (min, max) = lowerAndUpperBound(integralType) val mathClass = classOf[Math].getName // When casting floating values to integral types, Spark uses the method `Numeric.toInt` // Or `Numeric.toLong` directly. 
For positive floating values, it is equivalent to `Math.floor`; @@ -1449,12 +1443,10 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit (c, evPrim, evNull) => code"$evNull = true;" case TimestampType => castTimestampToIntegralTypeCode(ctx, "byte") case DecimalType() => castDecimalToIntegralTypeCode(ctx, "byte") - case _: ShortType | _: IntegerType | _: LongType if ansiEnabled => + case ShortType | IntegerType | LongType if ansiEnabled => castIntegralTypeToIntegralTypeExactCode("byte") - case _: FloatType if ansiEnabled => - castFractionToIntegralTypeCode("float", "byte") - case _: DoubleType if ansiEnabled => - castFractionToIntegralTypeCode("double", "byte") + case FloatType | DoubleType if ansiEnabled => + castFractionToIntegralTypeCode("byte") case x: NumericType => (c, evPrim, evNull) => code"$evPrim = (byte) $c;" } @@ -1482,12 +1474,10 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit (c, evPrim, evNull) => code"$evNull = true;" case TimestampType => castTimestampToIntegralTypeCode(ctx, "short") case DecimalType() => castDecimalToIntegralTypeCode(ctx, "short") - case _: IntegerType | _: LongType if ansiEnabled => + case IntegerType | LongType if ansiEnabled => castIntegralTypeToIntegralTypeExactCode("short") - case _: FloatType if ansiEnabled => - castFractionToIntegralTypeCode("float", "short") - case _: DoubleType if ansiEnabled => - castFractionToIntegralTypeCode("double", "short") + case FloatType | DoubleType if ansiEnabled => + castFractionToIntegralTypeCode("short") case x: NumericType => (c, evPrim, evNull) => code"$evPrim = (short) $c;" } @@ -1513,11 +1503,9 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit (c, evPrim, evNull) => code"$evNull = true;" case TimestampType => castTimestampToIntegralTypeCode(ctx, "int") case DecimalType() => castDecimalToIntegralTypeCode(ctx, "int") - case _: LongType if ansiEnabled => castIntegralTypeToIntegralTypeExactCode("int") - case _: FloatType if ansiEnabled => - castFractionToIntegralTypeCode("float", "int") - case _: DoubleType if ansiEnabled => - castFractionToIntegralTypeCode("double", "int") + case LongType if ansiEnabled => castIntegralTypeToIntegralTypeExactCode("int") + case FloatType | DoubleType if ansiEnabled => + castFractionToIntegralTypeCode("int") case x: NumericType => (c, evPrim, evNull) => code"$evPrim = (int) $c;" } @@ -1544,10 +1532,8 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit case TimestampType => (c, evPrim, evNull) => code"$evPrim = (long) ${timestampToLongCode(c)};" case DecimalType() => castDecimalToIntegralTypeCode(ctx, "long") - case _: FloatType if ansiEnabled => - castFractionToIntegralTypeCode("float", "long") - case _: DoubleType if ansiEnabled => - castFractionToIntegralTypeCode("double", "long") + case FloatType | DoubleType if ansiEnabled => + castFractionToIntegralTypeCode("long") case x: NumericType => (c, evPrim, evNull) => code"$evPrim = (long) $c;" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index d284c417042c1..35db25ec9342c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -975,6 +975,11 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { } } } + + test("SPARK-26218: 
Fix the corner case of codegen when casting float to Integer") { + checkExceptionInExpression[ArithmeticException]( + cast(cast(Literal("2147483648"), FloatType), IntegerType), "overflow") + } } /** From 0706e64c49f66431560cdbecb28adcda244c3342 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 3 Dec 2020 15:24:44 +0000 Subject: [PATCH 091/150] [SPARK-30098][SQL] Add a configuration to use default datasource as provider for CREATE TABLE command ### What changes were proposed in this pull request? For CRETE TABLE [AS SELECT] command, creates native Parquet table if neither USING nor STORE AS is specified and `spark.sql.legacy.createHiveTableByDefault` is false. This is a retry after we unify the CREATE TABLE syntax. It partially reverts https://github.com/apache/spark/commit/d2bec5e265e0aa4fa527c3f43cfe738cdbdc4598 This PR allows `CREATE EXTERNAL TABLE` when `LOCATION` is present. This was not allowed for data source tables before, which is an unnecessary behavior different with hive tables. ### Why are the changes needed? Changing from Hive text table to native Parquet table has many benefits: 1. be consistent with `DataFrameWriter.saveAsTable`. 2. better performance 3. better support for nested types (Hive text table doesn't work well with nested types, e.g. `insert into t values struct(null)` actually inserts a null value not `struct(null)` if `t` is a Hive text table, which leads to wrong result) 4. better interoperability as Parquet is a more popular open file format. ### Does this PR introduce _any_ user-facing change? No by default. If the config is set, the behavior change is described below: Behavior-wise, the change is very small as the native Parquet table is also Hive-compatible. All the Spark DDL commands that works for hive tables also works for native Parquet tables, with two exceptions: `ALTER TABLE SET [SERDE | SERDEPROPERTIES]` and `LOAD DATA`. char/varchar behavior has been taken care by https://github.com/apache/spark/pull/30412, and there is no behavior difference between data source and hive tables. One potential issue is `CREATE TABLE ... LOCATION ...` while users want to directly access the files later. It's more like a corner case and the legacy config should be good enough. Another potential issue is users may use Spark to create the table and then use Hive to add partitions with different serde. This is not allowed for Spark native tables. ### How was this patch tested? Re-enable the tests Closes #30554 from cloud-fan/create-table. 
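A hedged sketch of the new behavior switch (the session setup below is illustrative and assumes a Hive-enabled build): with the legacy flag set to false, a bare CREATE TABLE picks the default data source provider instead of a Hive text table.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("createTableProviderExample")
  .config("spark.sql.legacy.createHiveTableByDefault", "false")
  .enableHiveSupport()
  .getOrCreate()

// No USING / STORED AS: created as a native data source (Parquet by default) table.
spark.sql("CREATE TABLE t_native (id INT)")
// Explicit Hive syntax still produces a Hive serde table.
spark.sql("CREATE TABLE t_hive (id INT) STORED AS TEXTFILE")
```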
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../apache/spark/sql/internal/SQLConf.scala | 9 +++++ .../analysis/ResolveSessionCatalog.scala | 13 +++++--- .../sql/connector/DataSourceV2SQLSuite.scala | 33 ++++++++++--------- .../command/PlanResolutionSuite.scala | 6 ++-- .../execution/HiveCompatibilitySuite.scala | 4 +++ .../sql/hive/HiveShowCreateTableSuite.scala | 18 +++++++++- .../apache/spark/sql/hive/InsertSuite.scala | 3 +- .../spark/sql/hive/QueryPartitionSuite.scala | 5 +-- .../spark/sql/hive/StatisticsSuite.scala | 27 ++++++++++----- .../spark/sql/hive/client/VersionsSuite.scala | 1 + .../sql/hive/execution/HiveDDLSuite.scala | 2 +- .../sql/hive/execution/HiveSerDeSuite.scala | 5 +-- .../hive/execution/HiveTableScanSuite.scala | 5 ++- .../sql/hive/execution/SQLQuerySuite.scala | 1 + .../apache/spark/sql/hive/test/TestHive.scala | 13 ++++---- 15 files changed, 100 insertions(+), 45 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index a1d6f9f608873..b32476a5af71a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2921,6 +2921,15 @@ object SQLConf { .stringConf .createWithDefault("") + val LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT = + buildConf("spark.sql.legacy.createHiveTableByDefault") + .internal() + .doc("When set to true, CREATE TABLE syntax without USING or STORED AS will use Hive " + + s"instead of the value of ${DEFAULT_DATA_SOURCE_NAME.key} as the table provider.") + .version("3.1.0") + .booleanConf + .createWithDefault(true) + /** * Holds information about keys that have been deprecated. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index f6005f4b413a2..f35fcdc07c372 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} /** @@ -636,11 +636,16 @@ class ResolveSessionCatalog( (storageFormat, DDLUtils.HIVE_PROVIDER) } else { // If neither USING nor STORED AS/ROW FORMAT is specified, we create native data source - // tables if it's a CTAS and `conf.convertCTAS` is true. - // TODO: create native data source table by default for non-CTAS. - if (ctas && conf.convertCTAS) { + // tables if: + // 1. `LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT` is false, or + // 2. It's a CTAS and `conf.convertCTAS` is true. + val createHiveTableByDefault = conf.getConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT) + if (!createHiveTableByDefault || (ctas && conf.convertCTAS)) { (nonHiveStorageFormat, conf.defaultDataSourceName) } else { + logWarning("A Hive serde table will be created as there is no table provider " + + s"specified. 
You can set ${SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key} to false " + + "so that native data source table will be created instead.") (defaultHiveStorage, DDLUtils.HIVE_PROVIDER) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 583bc694dc3be..7635590ab462e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -266,22 +266,23 @@ class DataSourceV2SQLSuite checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) } - // TODO: ignored by SPARK-31707, restore the test after create table syntax unification - ignore("CreateTable: without USING clause") { - // unset this config to use the default v2 session catalog. - spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) - val testCatalog = catalog("testcat").asTableCatalog - - sql("CREATE TABLE testcat.t1 (id int)") - val t1 = testCatalog.loadTable(Identifier.of(Array(), "t1")) - // Spark shouldn't set the default provider for catalog plugins. - assert(!t1.properties.containsKey(TableCatalog.PROP_PROVIDER)) - - sql("CREATE TABLE t2 (id int)") - val t2 = spark.sessionState.catalogManager.v2SessionCatalog.asTableCatalog - .loadTable(Identifier.of(Array("default"), "t2")).asInstanceOf[V1Table] - // Spark should set the default provider as DEFAULT_DATA_SOURCE_NAME for the session catalog. - assert(t2.v1Table.provider == Some(conf.defaultDataSourceName)) + test("CreateTable: without USING clause") { + withSQLConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key -> "false") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + val testCatalog = catalog("testcat").asTableCatalog + + sql("CREATE TABLE testcat.t1 (id int)") + val t1 = testCatalog.loadTable(Identifier.of(Array(), "t1")) + // Spark shouldn't set the default provider for catalog plugins. + assert(!t1.properties.containsKey(TableCatalog.PROP_PROVIDER)) + + sql("CREATE TABLE t2 (id int)") + val t2 = spark.sessionState.catalogManager.v2SessionCatalog.asTableCatalog + .loadTable(Identifier.of(Array("default"), "t2")).asInstanceOf[V1Table] + // Spark should set the default provider as DEFAULT_DATA_SOURCE_NAME for the session catalog. + assert(t2.v1Table.provider == Some(conf.defaultDataSourceName)) + } } test("CreateTable/RepalceTable: invalid schema if has interval type") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 20cad721d3d0e..33515ad41e918 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -1588,7 +1588,7 @@ class PlanResolutionSuite extends AnalysisTest { .add("b", StringType) ) ) - compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) " + + compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) STORED AS textfile " + "PARTITIONED BY (c INT, d STRING COMMENT 'test2')", createTable( table = "my_tab", @@ -1616,7 +1616,7 @@ class PlanResolutionSuite extends AnalysisTest { ) // Partitioned by a StructType should be accepted by `SparkSqlParser` but will fail an analyze // rule in `AnalyzeCreateTable`. 
- compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) " + + compare("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) STORED AS textfile " + "PARTITIONED BY (nested STRUCT)", createTable( table = "my_tab", @@ -1890,7 +1890,7 @@ class PlanResolutionSuite extends AnalysisTest { } test("Test CTAS #3") { - val s3 = """CREATE TABLE page_view AS SELECT * FROM src""" + val s3 = """CREATE TABLE page_view STORED AS textfile AS SELECT * FROM src""" val (desc, exists) = extractTableDesc(s3) assert(exists == false) assert(desc.identifier.database == Some("default")) diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 462206d8c546f..4ce1964a19bd9 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -40,6 +40,8 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { private val originalInMemoryPartitionPruning = TestHive.conf.inMemoryPartitionPruning private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled private val originalSessionLocalTimeZone = TestHive.conf.sessionLocalTimeZone + private val originalCreateHiveTable = + TestHive.conf.getConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT) def testCases: Seq[(String, File)] = { hiveQueryDir.listFiles.map(f => f.getName.stripSuffix(".q") -> f) @@ -59,6 +61,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles") + TestHive.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT, true) RuleExecutor.resetMetrics() } @@ -69,6 +72,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) + TestHive.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT, originalCreateHiveTable) // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala index 3e7c3e6799724..2fb67c793dc6a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala @@ -21,10 +21,26 @@ import org.apache.spark.sql.{AnalysisException, ShowCreateTableSuite} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSingleton { + private var origCreateHiveTableConfig = false + + protected override def beforeAll(): Unit = { + super.beforeAll() + origCreateHiveTableConfig = + 
spark.conf.get(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT) + spark.conf.set(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key, true) + } + + protected override def afterAll(): Unit = { + spark.conf.set( + SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key, + origCreateHiveTableConfig) + super.afterAll() + } + test("view") { Seq(true, false).foreach { serde => withView("v1") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index ebc6cfb77d355..71750e6b3a516 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -277,7 +277,8 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter test("Test partition mode = strict") { withSQLConf(("hive.exec.dynamic.partition.mode", "strict")) { withTable("partitioned") { - sql("CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (part string)") + sql("CREATE TABLE partitioned (id bigint, data string) USING hive " + + "PARTITIONED BY (part string)") val data = (1 to 10).map(i => (i, s"data-$i", if ((i % 2) == 0) "even" else "odd")) .toDF("id", "data", "part") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 483622b16762a..cec6ec1ee1275 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -38,7 +38,7 @@ class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingl testData.createOrReplaceTempView("testData") // create the table for test - sql(s"CREATE TABLE table_with_partition(key int,value string) " + + sql(s"CREATE TABLE table_with_partition(key int,value string) USING hive " + s"PARTITIONED by (ds string) location '${tmpDir.toURI}' ") sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + "SELECT key,value FROM testData") @@ -81,7 +81,8 @@ class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingl test("SPARK-21739: Cast expression should initialize timezoneId") { withTable("table_with_timestamp_partition") { - sql("CREATE TABLE table_with_timestamp_partition(value int) PARTITIONED BY (ts TIMESTAMP)") + sql("CREATE TABLE table_with_timestamp_partition(value int) USING hive " + + "PARTITIONED BY (ts TIMESTAMP)") sql("INSERT OVERWRITE TABLE table_with_timestamp_partition " + "PARTITION (ts = '2010-01-01 00:00:00.000') VALUES (1)") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 43d1ba04c561d..2ea98943011f4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -165,7 +165,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // Partitioned table val partTable = "part_table" withTable(partTable) { - sql(s"CREATE TABLE $partTable (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"CREATE TABLE $partTable (key STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING)") sql(s"INSERT INTO TABLE $partTable PARTITION (ds='2010-01-01') SELECT * FROM src") sql(s"INSERT INTO TABLE $partTable PARTITION (ds='2010-01-02') SELECT * FROM src") sql(s"INSERT INTO 
TABLE $partTable PARTITION (ds='2010-01-03') SELECT * FROM src") @@ -191,7 +192,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto SQLConf.PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION.key -> "True") { val checkSizeTable = "checkSizeTable" withTable(checkSizeTable) { - sql(s"CREATE TABLE $checkSizeTable (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"CREATE TABLE $checkSizeTable (key STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING)") sql(s"INSERT INTO TABLE $checkSizeTable PARTITION (ds='2010-01-01') SELECT * FROM src") sql(s"INSERT INTO TABLE $checkSizeTable PARTITION (ds='2010-01-02') SELECT * FROM src") sql(s"INSERT INTO TABLE $checkSizeTable PARTITION (ds='2010-01-03') SELECT * FROM src") @@ -274,7 +276,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("SPARK-22745 - read Hive's statistics for partition") { val tableName = "hive_stats_part_table" withTable(tableName) { - sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"CREATE TABLE $tableName (key STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING)") sql(s"INSERT INTO TABLE $tableName PARTITION (ds='2017-01-01') SELECT * FROM src") var partition = spark.sessionState.catalog .getPartition(TableIdentifier(tableName), Map("ds" -> "2017-01-01")) @@ -296,7 +299,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto val tableName = "analyzeTable_part" withTable(tableName) { withTempPath { path => - sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"CREATE TABLE $tableName (key STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING)") val partitionDates = List("2010-01-01", "2010-01-02", "2010-01-03") partitionDates.foreach { ds => @@ -321,6 +325,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql( s""" |CREATE TABLE $sourceTableName (key STRING, value STRING) + |USING hive |PARTITIONED BY (ds STRING) |LOCATION '${path.toURI}' """.stripMargin) @@ -338,6 +343,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql( s""" |CREATE TABLE $tableName (key STRING, value STRING) + |USING hive |PARTITIONED BY (ds STRING) |LOCATION '${path.toURI}' """.stripMargin) @@ -371,7 +377,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } withTable(tableName) { - sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"CREATE TABLE $tableName (key STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING)") createPartition("2010-01-01", "SELECT '1', 'A' from src") createPartition("2010-01-02", "SELECT '1', 'A' from src UNION ALL SELECT '1', 'A' from src") @@ -424,7 +431,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } withTable(tableName) { - sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING, hr INT)") + sql(s"CREATE TABLE $tableName (key STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING, hr INT)") createPartition("2010-01-01", 10, "SELECT '1', 'A' from src") createPartition("2010-01-01", 11, "SELECT '1', 'A' from src") @@ -472,7 +480,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } withTable(tableName) { - sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING, hr INT)") + sql(s"CREATE TABLE $tableName (key 
STRING, value STRING) USING hive " + + "PARTITIONED BY (ds STRING, hr INT)") createPartition("2010-01-01", 10, "SELECT '1', 'A' from src") createPartition("2010-01-01", 11, "SELECT '1', 'A' from src") @@ -961,7 +970,8 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto Seq(false, true).foreach { autoUpdate => withSQLConf(SQLConf.AUTO_SIZE_UPDATE_ENABLED.key -> autoUpdate.toString) { withTable(table) { - sql(s"CREATE TABLE $table (i INT, j STRING) PARTITIONED BY (ds STRING, hr STRING)") + sql(s"CREATE TABLE $table (i INT, j STRING) USING hive " + + "PARTITIONED BY (ds STRING, hr STRING)") // table has two partitions initially for (ds <- Seq("2008-04-08"); hr <- Seq("11", "12")) { sql(s"INSERT OVERWRITE TABLE $table PARTITION (ds='$ds',hr='$hr') SELECT 1, 'a'") @@ -1034,6 +1044,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql( s""" |CREATE TABLE $managedTable (key INT, value STRING) + |USING hive |PARTITIONED BY (ds STRING, hr STRING) """.stripMargin) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index d9ba6dd80e4ef..684529aa330a7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -798,6 +798,7 @@ class VersionsSuite extends SparkFunSuite with Logging { versionSpark.sql( """ |CREATE TABLE tbl(c1 string) + |USING hive |PARTITIONED BY (ds STRING) """.stripMargin) versionSpark.sql("INSERT OVERWRITE TABLE tbl partition (ds='2') SELECT '1'") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 2dfb8bb552594..ce31e39985971 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -983,7 +983,7 @@ class HiveDDLSuite } test("alter table partition - storage information") { - sql("CREATE TABLE boxes (height INT, length INT) PARTITIONED BY (width INT)") + sql("CREATE TABLE boxes (height INT, length INT) STORED AS textfile PARTITIONED BY (width INT)") sql("INSERT OVERWRITE TABLE boxes PARTITION (width=4) SELECT 4, 4") val catalog = spark.sessionState.catalog val expectedSerde = "com.sparkbricks.serde.ColumnarSerDe" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala index f723c9f80c2ab..d7129bcb37e69 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala @@ -88,7 +88,7 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte test("Test the default fileformat for Hive-serde tables") { withSQLConf("hive.default.fileformat" -> "orc") { val (desc, exists) = extractTableDesc( - "CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + "CREATE TABLE IF NOT EXISTS fileformat_test (id int) USING hive") assert(exists) assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) @@ -96,7 +96,8 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest 
with BeforeAndAfte } withSQLConf("hive.default.fileformat" -> "parquet") { - val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + val (desc, exists) = extractTableDesc( + "CREATE TABLE IF NOT EXISTS fileformat_test (id int) USING hive") assert(exists) val input = desc.storage.inputFormat val output = desc.storage.outputFormat diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 4a50621d89d4e..5b43f82f253ea 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -113,6 +113,7 @@ class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestH sql( s""" |CREATE TABLE $table(id string) + |USING hive |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string) """.stripMargin) sql( @@ -157,6 +158,7 @@ class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestH sql( s""" |CREATE TABLE $table(id string) + |USING hive |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string) """.stripMargin) sql( @@ -182,6 +184,7 @@ class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestH sql( s""" |CREATE TABLE $table (id int) + |USING hive |PARTITIONED BY (a int, b int) """.stripMargin) val scan1 = getHiveTableScanExec(s"SELECT * FROM $table WHERE a = 1 AND b = 2") @@ -252,7 +255,7 @@ class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestH test("SPARK-32069: Improve error message on reading unexpected directory") { withTable("t") { withTempDir { f => - sql(s"CREATE TABLE t(i LONG) LOCATION '${f.getAbsolutePath}'") + sql(s"CREATE TABLE t(i LONG) USING hive LOCATION '${f.getAbsolutePath}'") sql("INSERT INTO t VALUES(1)") val dir = new File(f.getCanonicalPath + "/data") dir.mkdir() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 79b3c3efe531c..6b82b1267bc66 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2026,6 +2026,7 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi sql( """ |CREATE TABLE part_table (c STRING) + |STORED AS textfile |PARTITIONED BY (d STRING) """.stripMargin) sql(s"LOAD DATA LOCAL INPATH '$path/part-r-000011' " + diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala index a25c61c96f3d8..e996f2c6ec78f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -327,20 +327,22 @@ private[hive] class TestHiveSparkSession( } if (loadTestTables) { + def createTableSQL(tblName: String): String = { + s"CREATE TABLE $tblName (key INT, value STRING) STORED AS textfile" + } // The test tables that are defined in the Hive QTestUtil. 
// /itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java // https://github.com/apache/hive/blob/branch-0.13/data/scripts/q_test_init.sql @transient val hiveQTestUtilTables: Seq[TestTable] = Seq( TestTable("src", - "CREATE TABLE src (key INT, value STRING) STORED AS TEXTFILE".cmd, + createTableSQL("src").cmd, s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' INTO TABLE src".cmd), TestTable("src1", - "CREATE TABLE src1 (key INT, value STRING) STORED AS TEXTFILE".cmd, + createTableSQL("src1").cmd, s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv3.txt")}' INTO TABLE src1".cmd), TestTable("srcpart", () => { - "CREATE TABLE srcpart (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING)" - .cmd.apply() + s"${createTableSQL("srcpart")} PARTITIONED BY (ds STRING, hr STRING)".cmd.apply() for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq("11", "12")) { s""" |LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' @@ -349,8 +351,7 @@ private[hive] class TestHiveSparkSession( } }), TestTable("srcpart1", () => { - "CREATE TABLE srcpart1 (key INT, value STRING) PARTITIONED BY (ds STRING, hr INT)" - .cmd.apply() + s"${createTableSQL("srcpart1")} PARTITIONED BY (ds STRING, hr INT)".cmd.apply() for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- 11 to 12) { s""" |LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' From bd711863fdcdde21a7d64de8a9b6b7a8bf7c19ec Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Fri, 4 Dec 2020 01:37:44 +0900 Subject: [PATCH 092/150] [SPARK-33629][PYTHON] Make spark.buffer.size configuration visible on driver side ### What changes were proposed in this pull request? `spark.buffer.size` not applied in driver from pyspark. In this PR I've fixed this issue. ### Why are the changes needed? Apply the mentioned config on driver side. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing unit tests + manually. Added the following code temporarily: ``` def local_connect_and_auth(port, auth_secret): ... sock.connect(sa) print("SPARK_BUFFER_SIZE: %d" % int(os.environ.get("SPARK_BUFFER_SIZE", 65536))) <- This is the addition sockfile = sock.makefile("rwb", int(os.environ.get("SPARK_BUFFER_SIZE", 65536))) ... ``` Test: ``` #Compile Spark echo "spark.buffer.size 10000" >> conf/spark-defaults.conf $ ./bin/pyspark Python 3.8.5 (default, Jul 21 2020, 10:48:26) [Clang 11.0.3 (clang-1103.0.32.62)] on darwin Type "help", "copyright", "credits" or "license" for more information. 20/12/03 13:38:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 20/12/03 13:38:14 WARN SparkEnv: I/O encryption enabled without RPC encryption: keys will be visible on the wire. Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version 3.1.0-SNAPSHOT /_/ Using Python version 3.8.5 (default, Jul 21 2020 10:48:26) Spark context Web UI available at http://192.168.0.189:4040 Spark context available as 'sc' (master = local[*], app id = local-1606999094506). SparkSession available as 'spark'. >>> sc.setLogLevel("TRACE") >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect() ... SPARK_BUFFER_SIZE: 10000 ... [[0], [2], [3], [4], [6]] >>> ``` Closes #30592 from gaborgsomogyi/SPARK-33629. 
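For the JVM half of the change, a minimal sketch (the object name is illustrative) of reading the same setting that the Python driver now receives through the `SPARK_BUFFER_SIZE` environment variable:

```scala
import org.apache.spark.SparkConf

// Mirrors the lookup added to PythonUtils: spark.buffer.size with a 64k fallback.
object BufferSizeBridge {
  def sparkBufferSize(conf: SparkConf): Int = conf.getInt("spark.buffer.size", 65536)
}
```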
Authored-by: Gabor Somogyi Signed-off-by: HyukjinKwon --- .../main/scala/org/apache/spark/api/python/PythonUtils.scala | 4 ++++ python/pyspark/context.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala index 33849f6fcb65f..2f47d28f09103 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala @@ -89,4 +89,8 @@ private[spark] object PythonUtils { def getPythonAuthSocketTimeout(sc: JavaSparkContext): Long = { sc.conf.get(org.apache.spark.internal.config.Python.PYTHON_AUTH_SOCKET_TIMEOUT) } + + def getSparkBufferSize(sc: JavaSparkContext): Int = { + sc.conf.get(org.apache.spark.internal.config.BUFFER_SIZE) + } } diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 1c542fa897ece..3da535b026137 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -224,6 +224,8 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, self._encryption_enabled = self._jvm.PythonUtils.isEncryptionEnabled(self._jsc) os.environ["SPARK_AUTH_SOCKET_TIMEOUT"] = \ str(self._jvm.PythonUtils.getPythonAuthSocketTimeout(self._jsc)) + os.environ["SPARK_BUFFER_SIZE"] = \ + str(self._jvm.PythonUtils.getSparkBufferSize(self._jsc)) self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') self.pythonVer = "%d.%d" % sys.version_info[:2] From aa13e207c9091e24aae1edcf3bb5cd35d3a27cbb Mon Sep 17 00:00:00 2001 From: Anton Okolnychyi Date: Thu, 3 Dec 2020 09:12:30 -0800 Subject: [PATCH 093/150] [SPARK-33623][SQL] Add canDeleteWhere to SupportsDelete ### What changes were proposed in this pull request? This PR provides us with a way to check if a data source is going to reject the delete via `deleteWhere` at planning time. ### Why are the changes needed? The only way to support delete statements right now is to implement ``SupportsDelete``. According to its Javadoc, that interface is meant for cases when we can delete data without much effort (e.g. like deleting a complete partition in a Hive table). This PR actually provides us with a way to check if a data source is going to reject the delete via `deleteWhere` at planning time instead of just getting an exception during execution. In the future, we can use this functionality to decide whether Spark should rewrite this delete and execute a distributed query or it can just pass a set of filters. Consider an example of a partitioned Hive table. If we have a delete predicate like `part_col = '2020'`, we can just drop the matching partition to satisfy this delete. In this case, the data source should return `true` from `canDeleteWhere` and use the filters it accepts in `deleteWhere` to drop the partition. I consider this as a delete without significant effort. At the same time, if we have a delete predicate like `id = 10`, Hive tables would not be able to execute this delete using a metadata only operation without rewriting files. In that case, the data source should return `false` from `canDeleteWhere` and we should use a more sophisticated row-level API to find out which records should be removed (the API is yet to be discussed, but we need this PR as a basis). If we decide to support subqueries and all delete use cases by simply extending the existing API, this will mean all data sources will have to implement a lot of Spark logic to determine which records changed. 
I don't think we want to go that way as the Spark logic to determine which records should be deleted is independent of the underlying data source. So the assumption is that Spark will execute a plan to find which records must be deleted for data sources that return `false` from `canDeleteWhere`. ### Does this PR introduce _any_ user-facing change? Yes but it is backward compatible. ### How was this patch tested? This PR comes with a new test. Closes #30562 from aokolnychyi/spark-33623. Authored-by: Anton Okolnychyi Signed-off-by: Dongjoon Hyun --- .../sql/connector/catalog/SupportsDelete.java | 24 ++++++++++++++++++- .../spark/sql/connector/InMemoryTable.scala | 12 ++++++++++ .../datasources/v2/DataSourceV2Strategy.scala | 6 +++++ .../sql/connector/DataSourceV2SQLSuite.scala | 14 +++++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsDelete.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsDelete.java index 106f3283a62c8..261e5344be7b9 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsDelete.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsDelete.java @@ -28,8 +28,30 @@ */ @Evolving public interface SupportsDelete { + + /** + * Checks whether it is possible to delete data from a data source table that matches filter + * expressions. + *
<p>
      + * Rows should be deleted from the data source iff all of the filter expressions match. + * That is, the expressions must be interpreted as a set of filters that are ANDed together. + *
<p>
      + * Spark will call this method at planning time to check whether {@link #deleteWhere(Filter[])} + * would reject the delete operation because it requires significant effort. If this method + * returns false, Spark will not call {@link #deleteWhere(Filter[])} and will try to rewrite + * the delete operation and produce row-level changes if the data source table supports deleting + * individual records. + * + * @param filters filter expressions, used to select rows to delete when all expressions match + * @return true if the delete operation can be performed + */ + default boolean canDeleteWhere(Filter[] filters) { + return true; + } + /** - * Delete data from a data source table that matches filter expressions. + * Delete data from a data source table that matches filter expressions. Note that this method + * will be invoked only if {@link #canDeleteWhere(Filter[])} returns true. *
<p>
      * Rows are deleted from the data source iff all of the filter expressions match. That is, the * expressions must be interpreted as a set of filters that are ANDed together. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala index cfb044b428e41..c4c5835d9d1f5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala @@ -335,6 +335,10 @@ class InMemoryTable( } } + override def canDeleteWhere(filters: Array[Filter]): Boolean = { + InMemoryTable.supportsFilters(filters) + } + override def deleteWhere(filters: Array[Filter]): Unit = dataMap.synchronized { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper dataMap --= InMemoryTable.filtersToKeys(dataMap.keys, partCols.map(_.toSeq.quoted), filters) @@ -360,6 +364,14 @@ object InMemoryTable { } } + def supportsFilters(filters: Array[Filter]): Boolean = { + filters.flatMap(splitAnd).forall { + case _: EqualTo => true + case _: IsNotNull => true + case _ => false + } + } + private def extractValue( attr: String, partFieldNames: Seq[String], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 0c7bc19ad054e..938ba77fede47 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -221,6 +221,12 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat throw new AnalysisException(s"Exec update failed:" + s" cannot translate expression to source filter: $f")) }).toArray + + if (!table.asDeletable.canDeleteWhere(filters)) { + throw new AnalysisException( + s"Cannot delete from table ${table.name} where ${filters.mkString("[", ", ", "]")}") + } + DeleteFromTableExec(table.asDeletable, filters) :: Nil case _ => throw new AnalysisException("DELETE is only supported with v2 tables.") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 7635590ab462e..6ef4fd1372a78 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -1812,6 +1812,20 @@ class DataSourceV2SQLSuite } } + test("DeleteFrom: delete with unsupported predicates") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + val exc = intercept[AnalysisException] { + sql(s"DELETE FROM $t WHERE id > 3 AND p > 3") + } + + assert(spark.table(t).count === 3) + assert(exc.getMessage.contains(s"Cannot delete from table $t")) + } + } + test("DeleteFrom: DELETE is only supported with v2 tables") { // unset this config to use the default v2 session catalog. 
spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) From 63f9d474b9ec4b66741fcca1d3c3865c32936a85 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 3 Dec 2020 09:22:53 -0800 Subject: [PATCH 094/150] [SPARK-33634][SQL][TESTS] Use Analyzer in PlanResolutionSuite ### What changes were proposed in this pull request? Instead of using several analyzer rules, this PR uses the actual analyzer to run tests in `PlanResolutionSuite`. ### Why are the changes needed? Make the test suite to match reality. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? test-only Closes #30574 from cloud-fan/test. Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun --- .../command/PlanResolutionSuite.scala | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 33515ad41e918..9b7222da55368 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -26,14 +26,16 @@ import org.mockito.invocation.InvocationOnMock import org.apache.spark.sql.{AnalysisException, SaveMode} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveInlineTables, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedSubqueryColumnAliases, UnresolvedV2Relation} +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, EmptyFunctionRegistry, NoSuchTableException, ResolvedTable, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedSubqueryColumnAliases, UnresolvedV2Relation} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, StringLiteral} import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} -import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, Assignment, CreateTableAsSelect, CreateTableStatement, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, InsertIntoStatement, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, ShowTableProperties, SubqueryAlias, UpdateAction, UpdateTable} +import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, AppendData, Assignment, CreateTableAsSelect, CreateTableStatement, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, ShowTableProperties, SubqueryAlias, UpdateAction, UpdateTable} +import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.connector.FakeV2Provider import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, TableCapability, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.catalog.TableChange.{UpdateColumnComment, UpdateColumnType} +import org.apache.spark.sql.connector.expressions.Transform import 
org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} @@ -49,6 +51,7 @@ class PlanResolutionSuite extends AnalysisTest { private val table: Table = { val t = mock(classOf[Table]) when(t.schema()).thenReturn(new StructType().add("i", "int").add("s", "string")) + when(t.partitioning()).thenReturn(Array.empty[Transform]) t } @@ -151,22 +154,12 @@ class PlanResolutionSuite extends AnalysisTest { } else { catalogManagerWithoutDefault } - val analyzer = new Analyzer(catalogManager) - // TODO: run the analyzer directly. - val rules = Seq( - CTESubstitution, - ResolveInlineTables, - analyzer.ResolveRelations, - new ResolveCatalogs(catalogManager), - new ResolveSessionCatalog(catalogManager, _ == Seq("v"), _ => false), - analyzer.ResolveTables, - analyzer.ResolveReferences, - analyzer.ResolveSubqueryColumnAliases, - analyzer.ResolveReferences, - analyzer.ResolveAlterTableChanges) - rules.foldLeft(parsePlan(query)) { - case (plan, rule) => rule.apply(plan) + val analyzer = new Analyzer(catalogManager) { + override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = Seq( + new ResolveSessionCatalog(catalogManager, _ == Seq("v"), _ => false)) } + // We don't check analysis here, as we expect the plan to be unresolved such as `CreateTable`. + analyzer.execute(CatalystSqlParser.parsePlan(query)) } private def parseResolveCompare(query: String, expected: LogicalPlan): Unit = @@ -1156,9 +1149,9 @@ class PlanResolutionSuite extends AnalysisTest { ("ALTER TABLE testcat.tab ALTER COLUMN i TYPE bigint", false), ("ALTER TABLE tab ALTER COLUMN i TYPE bigint", false), (s"ALTER TABLE $v2SessionCatalogTable ALTER COLUMN i TYPE bigint", true), - ("INSERT INTO TABLE tab VALUES (1)", false), - ("INSERT INTO TABLE testcat.tab VALUES (1)", false), - (s"INSERT INTO TABLE $v2SessionCatalogTable VALUES (1)", true), + ("INSERT INTO TABLE tab VALUES (1, 'a')", false), + ("INSERT INTO TABLE testcat.tab VALUES (1, 'a')", false), + (s"INSERT INTO TABLE $v2SessionCatalogTable VALUES (1, 'a')", true), ("DESC TABLE tab", false), ("DESC TABLE testcat.tab", false), (s"DESC TABLE $v2SessionCatalogTable", true), @@ -1183,7 +1176,7 @@ class PlanResolutionSuite extends AnalysisTest { case Project(_, AsDataSourceV2Relation(r)) => assert(r.catalog.exists(_ == catlogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) - case InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _, _) => + case AppendData(r: DataSourceV2Relation, _, _, _) => assert(r.catalog.exists(_ == catlogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) case DescribeRelation(r: ResolvedTable, _, _) => From 7e759b2d95eb3592d62ec010297c39384173a93c Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 4 Dec 2020 08:35:50 +0800 Subject: [PATCH 095/150] [SPARK-33520][ML][PYSPARK] make CrossValidator/TrainValidateSplit/OneVsRest Reader/Writer support Python backend estimator/evaluator ### What changes were proposed in this pull request? make CrossValidator/TrainValidateSplit/OneVsRest Reader/Writer support Python backend estimator/model ### Why are the changes needed? Currently, pyspark support third-party library to define python backend estimator/evaluator, i.e., estimator that inherit `Estimator` instead of `JavaEstimator`, and only can be used in pyspark. 
CrossValidator and TrainValidateSplit support tuning these python backend estimator, but cannot support saving/load, becase CrossValidator and TrainValidateSplit writer implementation is use JavaMLWriter, which require to convert nested estimator and evaluator into java instance. OneVsRest saving/load now only support java backend classifier due to similar issue. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test. Closes #30471 from WeichenXu123/support_pyio_tuning. Authored-by: Weichen Xu Signed-off-by: Weichen Xu --- python/pyspark/ml/classification.py | 128 ++++++- python/pyspark/ml/classification.pyi | 31 +- python/pyspark/ml/tests/test_persistence.py | 14 +- python/pyspark/ml/tests/test_tuning.py | 97 ++++-- python/pyspark/ml/tuning.py | 357 +++++++++++++++++++- python/pyspark/ml/tuning.pyi | 40 +++ python/pyspark/ml/util.py | 42 ++- python/pyspark/ml/util.pyi | 2 + python/pyspark/testing/mlutils.py | 87 +++++ 9 files changed, 739 insertions(+), 59 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 763038ede876a..0553a61c6c771 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -15,6 +15,7 @@ # limitations under the License. # +import os import operator import sys import uuid @@ -33,7 +34,9 @@ _HasVarianceImpurity, _TreeClassifierParams from pyspark.ml.regression import _FactorizationMachinesParams, DecisionTreeRegressionModel from pyspark.ml.base import _PredictorParams -from pyspark.ml.util import JavaMLWritable, JavaMLReadable, HasTrainingSummary +from pyspark.ml.util import DefaultParamsReader, DefaultParamsWriter, \ + JavaMLReadable, JavaMLReader, JavaMLWritable, JavaMLWriter, \ + MLReader, MLReadable, MLWriter, MLWritable, HasTrainingSummary from pyspark.ml.wrapper import JavaParams, \ JavaPredictor, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc @@ -2760,7 +2763,7 @@ def getClassifier(self): @inherit_doc -class OneVsRest(Estimator, _OneVsRestParams, HasParallelism, JavaMLReadable, JavaMLWritable): +class OneVsRest(Estimator, _OneVsRestParams, HasParallelism, MLReadable, MLWritable): """ Reduction of Multiclass Classification to Binary Classification. Performs reduction using one against all strategy. 
@@ -2991,8 +2994,73 @@ def _to_java(self): _java_obj.setRawPredictionCol(self.getRawPredictionCol()) return _java_obj + @classmethod + def read(cls): + return OneVsRestReader(cls) + + def write(self): + if isinstance(self.getClassifier(), JavaMLWritable): + return JavaMLWriter(self) + else: + return OneVsRestWriter(self) + + +class _OneVsRestSharedReadWrite: + @staticmethod + def saveImpl(instance, sc, path, extraMetadata=None): + skipParams = ['classifier'] + jsonParams = DefaultParamsWriter.extractJsonParams(instance, skipParams) + DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap=jsonParams, + extraMetadata=extraMetadata) + classifierPath = os.path.join(path, 'classifier') + instance.getClassifier().save(classifierPath) + + @staticmethod + def loadClassifier(path, sc): + classifierPath = os.path.join(path, 'classifier') + return DefaultParamsReader.loadParamsInstance(classifierPath, sc) + + @staticmethod + def validateParams(instance): + elems_to_check = [instance.getClassifier()] + if isinstance(instance, OneVsRestModel): + elems_to_check.extend(instance.models) + + for elem in elems_to_check: + if not isinstance(elem, MLWritable): + raise ValueError(f'OneVsRest write will fail because it contains {elem.uid} ' + f'which is not writable.') + + +@inherit_doc +class OneVsRestReader(MLReader): + def __init__(self, cls): + super(OneVsRestReader, self).__init__() + self.cls = cls + + def load(self, path): + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + if not DefaultParamsReader.isPythonParamsInstance(metadata): + return JavaMLReader(self.cls).load(path) + else: + classifier = _OneVsRestSharedReadWrite.loadClassifier(path, self.sc) + ova = OneVsRest(classifier=classifier)._resetUid(metadata['uid']) + DefaultParamsReader.getAndSetParams(ova, metadata, skipParams=['classifier']) + return ova + + +@inherit_doc +class OneVsRestWriter(MLWriter): + def __init__(self, instance): + super(OneVsRestWriter, self).__init__() + self.instance = instance + + def saveImpl(self, path): + _OneVsRestSharedReadWrite.validateParams(self.instance) + _OneVsRestSharedReadWrite.saveImpl(self.instance, self.sc, path) -class OneVsRestModel(Model, _OneVsRestParams, JavaMLReadable, JavaMLWritable): + +class OneVsRestModel(Model, _OneVsRestParams, MLReadable, MLWritable): """ Model fitted by OneVsRest. This stores the models resulting from training k binary classifiers: one for each class. 
@@ -3023,6 +3091,9 @@ def setRawPredictionCol(self, value): def __init__(self, models): super(OneVsRestModel, self).__init__() self.models = models + if not isinstance(models[0], JavaMLWritable): + return + # set java instance java_models = [model._to_java() for model in self.models] sc = SparkContext._active_spark_context java_models_array = JavaWrapper._new_java_array(java_models, @@ -3160,6 +3231,57 @@ def _to_java(self): _java_obj.set("weightCol", self.getWeightCol()) return _java_obj + @classmethod + def read(cls): + return OneVsRestModelReader(cls) + + def write(self): + if all(map(lambda elem: isinstance(elem, JavaMLWritable), + [self.getClassifier()] + self.models)): + return JavaMLWriter(self) + else: + return OneVsRestModelWriter(self) + + +@inherit_doc +class OneVsRestModelReader(MLReader): + def __init__(self, cls): + super(OneVsRestModelReader, self).__init__() + self.cls = cls + + def load(self, path): + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + if not DefaultParamsReader.isPythonParamsInstance(metadata): + return JavaMLReader(self.cls).load(path) + else: + classifier = _OneVsRestSharedReadWrite.loadClassifier(path, self.sc) + numClasses = metadata['numClasses'] + subModels = [None] * numClasses + for idx in range(numClasses): + subModelPath = os.path.join(path, f'model_{idx}') + subModels[idx] = DefaultParamsReader.loadParamsInstance(subModelPath, self.sc) + ovaModel = OneVsRestModel(subModels)._resetUid(metadata['uid']) + ovaModel.set(ovaModel.classifier, classifier) + DefaultParamsReader.getAndSetParams(ovaModel, metadata, skipParams=['classifier']) + return ovaModel + + +@inherit_doc +class OneVsRestModelWriter(MLWriter): + def __init__(self, instance): + super(OneVsRestModelWriter, self).__init__() + self.instance = instance + + def saveImpl(self, path): + _OneVsRestSharedReadWrite.validateParams(self.instance) + instance = self.instance + numClasses = len(instance.models) + extraMetadata = {'numClasses': numClasses} + _OneVsRestSharedReadWrite.saveImpl(instance, self.sc, path, extraMetadata=extraMetadata) + for idx in range(numClasses): + subModelPath = os.path.join(path, f'model_{idx}') + instance.models[idx].save(subModelPath) + @inherit_doc class FMClassifier(_JavaProbabilisticClassifier, _FactorizationMachinesParams, JavaMLWritable, diff --git a/python/pyspark/ml/classification.pyi b/python/pyspark/ml/classification.pyi index c44176a13a69b..a4a3d21018ad9 100644 --- a/python/pyspark/ml/classification.pyi +++ b/python/pyspark/ml/classification.pyi @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, List, Optional +from typing import Any, List, Optional, Type from pyspark.ml._typing import JM, M, P, T, ParamMap import abc @@ -53,7 +53,8 @@ from pyspark.ml.tree import ( _TreeClassifierParams, _TreeEnsembleModel, ) -from pyspark.ml.util import HasTrainingSummary, JavaMLReadable, JavaMLWritable +from pyspark.ml.util import HasTrainingSummary, JavaMLReadable, JavaMLWritable, \ + MLReader, MLReadable, MLWriter, MLWritable from pyspark.ml.wrapper import JavaPredictionModel, JavaPredictor, JavaWrapper from pyspark.ml.linalg import Matrix, Vector @@ -797,8 +798,8 @@ class OneVsRest( Estimator[OneVsRestModel], _OneVsRestParams, HasParallelism, - JavaMLReadable[OneVsRest], - JavaMLWritable, + MLReadable[OneVsRest], + MLWritable, ): def __init__( self, @@ -832,7 +833,7 @@ class OneVsRest( def copy(self, extra: Optional[ParamMap] = ...) -> OneVsRest: ... 
class OneVsRestModel( - Model, _OneVsRestParams, JavaMLReadable[OneVsRestModel], JavaMLWritable + Model, _OneVsRestParams, MLReadable[OneVsRestModel], MLWritable ): models: List[Transformer] def __init__(self, models: List[Transformer]) -> None: ... @@ -841,6 +842,26 @@ class OneVsRestModel( def setRawPredictionCol(self, value: str) -> OneVsRestModel: ... def copy(self, extra: Optional[ParamMap] = ...) -> OneVsRestModel: ... +class OneVsRestWriter(MLWriter): + instance: OneVsRest + def __init__(self, instance: OneVsRest) -> None: ... + def saveImpl(self, path: str) -> None: ... + +class OneVsRestReader(MLReader[OneVsRest]): + cls: Type[OneVsRest] + def __init__(self, cls: Type[OneVsRest]) -> None: ... + def load(self, path: str) -> OneVsRest: ... + +class OneVsRestModelWriter(MLWriter): + instance: OneVsRestModel + def __init__(self, instance: OneVsRestModel) -> None: ... + def saveImpl(self, path: str) -> None: ... + +class OneVsRestModelReader(MLReader[OneVsRestModel]): + cls: Type[OneVsRestModel] + def __init__(self, cls: Type[OneVsRestModel]) -> None: ... + def load(self, path: str) -> OneVsRestModel: ... + class FMClassifier( _JavaProbabilisticClassifier[FMClassificationModel], _FactorizationMachinesParams, diff --git a/python/pyspark/ml/tests/test_persistence.py b/python/pyspark/ml/tests/test_persistence.py index 0bbcfcdf50e95..77a6c0309628a 100644 --- a/python/pyspark/ml/tests/test_persistence.py +++ b/python/pyspark/ml/tests/test_persistence.py @@ -237,6 +237,11 @@ def _compare_pipelines(self, m1, m2): self.assertEqual(len(m1.models), len(m2.models)) for x, y in zip(m1.models, m2.models): self._compare_pipelines(x, y) + elif isinstance(m1, Params): + # Test on python backend Estimator/Transformer/Model/Evaluator + self.assertEqual(len(m1.params), len(m2.params)) + for p in m1.params: + self._compare_params(m1, m2, p) else: raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1)) @@ -326,14 +331,14 @@ def test_python_transformer_pipeline_persistence(self): except OSError: pass - def test_onevsrest(self): + def _run_test_onevsrest(self, LogisticRegressionCls): temp_path = tempfile.mkdtemp() df = self.spark.createDataFrame([(0.0, 0.5, Vectors.dense(1.0, 0.8)), (1.0, 0.5, Vectors.sparse(2, [], [])), (2.0, 1.0, Vectors.dense(0.5, 0.5))] * 10, ["label", "wt", "features"]) - lr = LogisticRegression(maxIter=5, regParam=0.01) + lr = LogisticRegressionCls(maxIter=5, regParam=0.01) ovr = OneVsRest(classifier=lr) def reload_and_compare(ovr, suffix): @@ -350,6 +355,11 @@ def reload_and_compare(ovr, suffix): reload_and_compare(OneVsRest(classifier=lr), "ovr") reload_and_compare(OneVsRest(classifier=lr).setWeightCol("wt"), "ovrw") + def test_onevsrest(self): + from pyspark.testing.mlutils import DummyLogisticRegression + self._run_test_onevsrest(LogisticRegression) + self._run_test_onevsrest(DummyLogisticRegression) + def test_decisiontree_classifier(self): dt = DecisionTreeClassifier(maxDepth=1) path = tempfile.mkdtemp() diff --git a/python/pyspark/ml/tests/test_tuning.py b/python/pyspark/ml/tests/test_tuning.py index ebd7457e4d30a..3cde34facbf9a 100644 --- a/python/pyspark/ml/tests/test_tuning.py +++ b/python/pyspark/ml/tests/test_tuning.py @@ -28,7 +28,8 @@ from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder, \ TrainValidationSplit, TrainValidationSplitModel from pyspark.sql.functions import rand -from pyspark.testing.mlutils import SparkSessionTestCase +from pyspark.testing.mlutils import DummyEvaluator, DummyLogisticRegression, \ + 
DummyLogisticRegressionModel, SparkSessionTestCase class HasInducedError(Params): @@ -201,7 +202,7 @@ def test_param_grid_type_coercion(self): for v in param.values(): assert(type(v) == float) - def test_save_load_trained_model(self): + def _run_test_save_load_trained_model(self, LogisticRegressionCls, LogisticRegressionModelCls): # This tests saving and loading the trained model only. # Save/load for CrossValidator will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() @@ -212,7 +213,7 @@ def test_save_load_trained_model(self): (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) - lr = LogisticRegression() + lr = LogisticRegressionCls() grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() evaluator = BinaryClassificationEvaluator() cv = CrossValidator( @@ -228,7 +229,7 @@ def test_save_load_trained_model(self): lrModelPath = temp_path + "/lrModel" lrModel.save(lrModelPath) - loadedLrModel = LogisticRegressionModel.load(lrModelPath) + loadedLrModel = LogisticRegressionModelCls.load(lrModelPath) self.assertEqual(loadedLrModel.uid, lrModel.uid) self.assertEqual(loadedLrModel.intercept, lrModel.intercept) @@ -248,7 +249,12 @@ def test_save_load_trained_model(self): loadedCvModel.isSet(param) for param in loadedCvModel.params )) - def test_save_load_simple_estimator(self): + def test_save_load_trained_model(self): + self._run_test_save_load_trained_model(LogisticRegression, LogisticRegressionModel) + self._run_test_save_load_trained_model(DummyLogisticRegression, + DummyLogisticRegressionModel) + + def _run_test_save_load_simple_estimator(self, LogisticRegressionCls, evaluatorCls): temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), @@ -258,9 +264,9 @@ def test_save_load_simple_estimator(self): (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) - lr = LogisticRegression() + lr = LogisticRegressionCls() grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() + evaluator = evaluatorCls() # test save/load of CrossValidator cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) @@ -278,6 +284,12 @@ def test_save_load_simple_estimator(self): loadedModel = CrossValidatorModel.load(cvModelPath) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) + def test_save_load_simple_estimator(self): + self._run_test_save_load_simple_estimator( + LogisticRegression, BinaryClassificationEvaluator) + self._run_test_save_load_simple_estimator( + DummyLogisticRegression, DummyEvaluator) + def test_parallel_evaluation(self): dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), @@ -343,7 +355,7 @@ def checkSubModels(subModels): for j in range(len(grid)): self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid) - def test_save_load_nested_estimator(self): + def _run_test_save_load_nested_estimator(self, LogisticRegressionCls): temp_path = tempfile.mkdtemp() dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), @@ -353,9 +365,9 @@ def test_save_load_nested_estimator(self): (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) - ova = OneVsRest(classifier=LogisticRegression()) - lr1 = LogisticRegression().setMaxIter(100) - lr2 = LogisticRegression().setMaxIter(150) + ova = OneVsRest(classifier=LogisticRegressionCls()) + lr1 = LogisticRegressionCls().setMaxIter(100) + lr2 = LogisticRegressionCls().setMaxIter(150) grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, 
lr2]).build() evaluator = MulticlassClassificationEvaluator() @@ -385,7 +397,11 @@ def test_save_load_nested_estimator(self): self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) - def test_save_load_pipeline_estimator(self): + def test_save_load_nested_estimator(self): + self._run_test_save_load_nested_estimator(LogisticRegression) + self._run_test_save_load_nested_estimator(DummyLogisticRegression) + + def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls): temp_path = tempfile.mkdtemp() training = self.spark.createDataFrame([ (0, "a b c d e spark", 1.0), @@ -402,9 +418,9 @@ def test_save_load_pipeline_estimator(self): tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") - ova = OneVsRest(classifier=LogisticRegression()) - lr1 = LogisticRegression().setMaxIter(5) - lr2 = LogisticRegression().setMaxIter(10) + ova = OneVsRest(classifier=LogisticRegressionCls()) + lr1 = LogisticRegressionCls().setMaxIter(5) + lr2 = LogisticRegressionCls().setMaxIter(10) pipeline = Pipeline(stages=[tokenizer, hashingTF, ova]) @@ -464,6 +480,10 @@ def test_save_load_pipeline_estimator(self): original_nested_pipeline_model.stages): self.assertEqual(loadedStage.uid, originalStage.uid) + def test_save_load_pipeline_estimator(self): + self._run_test_save_load_pipeline_estimator(LogisticRegression) + self._run_test_save_load_pipeline_estimator(DummyLogisticRegression) + def test_user_specified_folds(self): from pyspark.sql import functions as F @@ -593,7 +613,7 @@ def test_fit_maximize_metric(self): "validationMetrics has the same size of grid parameter") self.assertEqual(1.0, max(validationMetrics)) - def test_save_load_trained_model(self): + def _run_test_save_load_trained_model(self, LogisticRegressionCls, LogisticRegressionModelCls): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() @@ -604,7 +624,7 @@ def test_save_load_trained_model(self): (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) - lr = LogisticRegression() + lr = LogisticRegressionCls() grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() evaluator = BinaryClassificationEvaluator() tvs = TrainValidationSplit( @@ -619,7 +639,7 @@ def test_save_load_trained_model(self): lrModelPath = temp_path + "/lrModel" lrModel.save(lrModelPath) - loadedLrModel = LogisticRegressionModel.load(lrModelPath) + loadedLrModel = LogisticRegressionModelCls.load(lrModelPath) self.assertEqual(loadedLrModel.uid, lrModel.uid) self.assertEqual(loadedLrModel.intercept, lrModel.intercept) @@ -636,7 +656,12 @@ def test_save_load_trained_model(self): loadedTvsModel.isSet(param) for param in loadedTvsModel.params )) - def test_save_load_simple_estimator(self): + def test_save_load_trained_model(self): + self._run_test_save_load_trained_model(LogisticRegression, LogisticRegressionModel) + self._run_test_save_load_trained_model(DummyLogisticRegression, + DummyLogisticRegressionModel) + + def _run_test_save_load_simple_estimator(self, LogisticRegressionCls, evaluatorCls): # This tests saving and loading the trained model only. 
# Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() @@ -647,9 +672,9 @@ def test_save_load_simple_estimator(self): (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) - lr = LogisticRegression() + lr = LogisticRegressionCls() grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() - evaluator = BinaryClassificationEvaluator() + evaluator = evaluatorCls() tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) @@ -666,6 +691,12 @@ def test_save_load_simple_estimator(self): loadedModel = TrainValidationSplitModel.load(tvsModelPath) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) + def test_save_load_simple_estimator(self): + self._run_test_save_load_simple_estimator( + LogisticRegression, BinaryClassificationEvaluator) + self._run_test_save_load_simple_estimator( + DummyLogisticRegression, DummyEvaluator) + def test_parallel_evaluation(self): dataset = self.spark.createDataFrame( [(Vectors.dense([0.0]), 0.0), @@ -718,7 +749,7 @@ def test_expose_sub_models(self): for i in range(len(grid)): self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid) - def test_save_load_nested_estimator(self): + def _run_test_save_load_nested_estimator(self, LogisticRegressionCls): # This tests saving and loading the trained model only. # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() @@ -729,9 +760,9 @@ def test_save_load_nested_estimator(self): (Vectors.dense([0.6]), 1.0), (Vectors.dense([1.0]), 1.0)] * 10, ["features", "label"]) - ova = OneVsRest(classifier=LogisticRegression()) - lr1 = LogisticRegression().setMaxIter(100) - lr2 = LogisticRegression().setMaxIter(150) + ova = OneVsRest(classifier=LogisticRegressionCls()) + lr1 = LogisticRegressionCls().setMaxIter(100) + lr2 = LogisticRegressionCls().setMaxIter(150) grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build() evaluator = MulticlassClassificationEvaluator() @@ -759,7 +790,11 @@ def test_save_load_nested_estimator(self): self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid) self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) - def test_save_load_pipeline_estimator(self): + def test_save_load_nested_estimator(self): + self._run_test_save_load_nested_estimator(LogisticRegression) + self._run_test_save_load_nested_estimator(DummyLogisticRegression) + + def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls): temp_path = tempfile.mkdtemp() training = self.spark.createDataFrame([ (0, "a b c d e spark", 1.0), @@ -776,9 +811,9 @@ def test_save_load_pipeline_estimator(self): tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") - ova = OneVsRest(classifier=LogisticRegression()) - lr1 = LogisticRegression().setMaxIter(5) - lr2 = LogisticRegression().setMaxIter(10) + ova = OneVsRest(classifier=LogisticRegressionCls()) + lr1 = LogisticRegressionCls().setMaxIter(5) + lr2 = LogisticRegressionCls().setMaxIter(10) pipeline = Pipeline(stages=[tokenizer, hashingTF, ova]) @@ -836,6 +871,10 @@ def test_save_load_pipeline_estimator(self): original_nested_pipeline_model.stages): self.assertEqual(loadedStage.uid, originalStage.uid) + def test_save_load_pipeline_estimator(self): + self._run_test_save_load_pipeline_estimator(LogisticRegression) + 
self._run_test_save_load_pipeline_estimator(DummyLogisticRegression) + def test_copy(self): dataset = self.spark.createDataFrame([ (10, 10.0), diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 2b5a9857b0f18..2c083182de470 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -15,6 +15,7 @@ # limitations under the License. # +import os import sys import itertools from multiprocessing.pool import ThreadPool @@ -22,12 +23,13 @@ import numpy as np from pyspark import keyword_only, since, SparkContext -from pyspark.ml import Estimator, Model -from pyspark.ml.common import _py2java, _java2py +from pyspark.ml import Estimator, Transformer, Model +from pyspark.ml.common import inherit_doc, _py2java, _java2py +from pyspark.ml.evaluation import Evaluator from pyspark.ml.param import Params, Param, TypeConverters from pyspark.ml.param.shared import HasCollectSubModels, HasParallelism, HasSeed -from pyspark.ml.util import MLReadable, MLWritable, JavaMLWriter, JavaMLReader, \ - MetaAlgorithmReadWrite +from pyspark.ml.util import DefaultParamsReader, DefaultParamsWriter, MetaAlgorithmReadWrite, \ + MLReadable, MLReader, MLWritable, MLWriter, JavaMLReader, JavaMLWriter from pyspark.ml.wrapper import JavaParams, JavaEstimator, JavaWrapper from pyspark.sql.functions import col, lit, rand, UserDefinedFunction from pyspark.sql.types import BooleanType @@ -229,6 +231,7 @@ def _to_java_impl(self): class _ValidatorSharedReadWrite: + @staticmethod def meta_estimator_transfer_param_maps_to_java(pyEstimator, pyParamMaps): pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator) @@ -275,10 +278,8 @@ def meta_estimator_transfer_param_maps_from_java(pyEstimator, javaParamMaps): raise ValueError('Resolve param in estimatorParamMaps failed: ' + javaParam.parent() + '.' + javaParam.name()) javaValue = javaPair.value() - if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(javaValue): - # Note: JavaParams._from_java support both JavaEstimator/JavaTransformer class - # and Estimator/Transformer class which implements `_from_java` static method - # (such as OneVsRest, Pipeline class). 
+ if sc._jvm.Class.forName("org.apache.spark.ml.util.DefaultParamsWritable") \ + .isInstance(javaValue): pyValue = JavaParams._from_java(javaValue) else: pyValue = _java2py(sc, javaValue) @@ -286,6 +287,222 @@ def meta_estimator_transfer_param_maps_from_java(pyEstimator, javaParamMaps): pyParamMaps.append(pyParamMap) return pyParamMaps + @staticmethod + def is_java_convertible(instance): + allNestedStages = MetaAlgorithmReadWrite.getAllNestedStages(instance.getEstimator()) + evaluator_convertible = isinstance(instance.getEvaluator(), JavaParams) + estimator_convertible = all(map(lambda stage: hasattr(stage, '_to_java'), allNestedStages)) + return estimator_convertible and evaluator_convertible + + @staticmethod + def saveImpl(path, instance, sc, extraMetadata=None): + numParamsNotJson = 0 + jsonEstimatorParamMaps = [] + for paramMap in instance.getEstimatorParamMaps(): + jsonParamMap = [] + for p, v in paramMap.items(): + jsonParam = {'parent': p.parent, 'name': p.name} + if (isinstance(v, Estimator) and not MetaAlgorithmReadWrite.isMetaEstimator(v)) \ + or isinstance(v, Transformer) or isinstance(v, Evaluator): + relative_path = f'epm_{p.name}{numParamsNotJson}' + param_path = os.path.join(path, relative_path) + numParamsNotJson += 1 + v.save(param_path) + jsonParam['value'] = relative_path + jsonParam['isJson'] = False + elif isinstance(v, MLWritable): + raise RuntimeError( + "ValidatorSharedReadWrite.saveImpl does not handle parameters of type: " + "MLWritable that are not Estimaor/Evaluator/Transformer, and if parameter " + "is estimator, it cannot be meta estimator such as Validator or OneVsRest") + else: + jsonParam['value'] = v + jsonParam['isJson'] = True + jsonParamMap.append(jsonParam) + jsonEstimatorParamMaps.append(jsonParamMap) + + skipParams = ['estimator', 'evaluator', 'estimatorParamMaps'] + jsonParams = DefaultParamsWriter.extractJsonParams(instance, skipParams) + jsonParams['estimatorParamMaps'] = jsonEstimatorParamMaps + + DefaultParamsWriter.saveMetadata(instance, path, sc, extraMetadata, jsonParams) + evaluatorPath = os.path.join(path, 'evaluator') + instance.getEvaluator().save(evaluatorPath) + estimatorPath = os.path.join(path, 'estimator') + instance.getEstimator().save(estimatorPath) + + @staticmethod + def load(path, sc, metadata): + evaluatorPath = os.path.join(path, 'evaluator') + evaluator = DefaultParamsReader.loadParamsInstance(evaluatorPath, sc) + estimatorPath = os.path.join(path, 'estimator') + estimator = DefaultParamsReader.loadParamsInstance(estimatorPath, sc) + + uidToParams = MetaAlgorithmReadWrite.getUidMap(estimator) + uidToParams[evaluator.uid] = evaluator + + jsonEstimatorParamMaps = metadata['paramMap']['estimatorParamMaps'] + + estimatorParamMaps = [] + for jsonParamMap in jsonEstimatorParamMaps: + paramMap = {} + for jsonParam in jsonParamMap: + est = uidToParams[jsonParam['parent']] + param = getattr(est, jsonParam['name']) + if 'isJson' not in jsonParam or ('isJson' in jsonParam and jsonParam['isJson']): + value = jsonParam['value'] + else: + relativePath = jsonParam['value'] + valueSavedPath = os.path.join(path, relativePath) + value = DefaultParamsReader.loadParamsInstance(valueSavedPath, sc) + paramMap[param] = value + estimatorParamMaps.append(paramMap) + + return metadata, estimator, evaluator, estimatorParamMaps + + @staticmethod + def validateParams(instance): + estiamtor = instance.getEstimator() + evaluator = instance.getEvaluator() + uidMap = MetaAlgorithmReadWrite.getUidMap(estiamtor) + + for elem in [evaluator] + 
list(uidMap.values()): + if not isinstance(elem, MLWritable): + raise ValueError(f'Validator write will fail because it contains {elem.uid} ' + f'which is not writable.') + + estimatorParamMaps = instance.getEstimatorParamMaps() + paramErr = 'Validator save requires all Params in estimatorParamMaps to apply to ' \ + f'its Estimator, An extraneous Param was found: ' + for paramMap in estimatorParamMaps: + for param in paramMap: + if param.parent not in uidMap: + raise ValueError(paramErr + repr(param)) + + @staticmethod + def getValidatorModelWriterPersistSubModelsParam(writer): + if 'persistsubmodels' in writer.optionMap: + persistSubModelsParam = writer.optionMap['persistsubmodels'].lower() + if persistSubModelsParam == 'true': + return True + elif persistSubModelsParam == 'false': + return False + else: + raise ValueError( + f'persistSubModels option value {persistSubModelsParam} is invalid, ' + f"the possible values are True, 'True' or False, 'False'") + else: + return writer.instance.subModels is not None + + +_save_with_persist_submodels_no_submodels_found_err = \ + 'When persisting tuning models, you can only set persistSubModels to true if the tuning ' \ + 'was done with collectSubModels set to true. To save the sub-models, try rerunning fitting ' \ + 'with collectSubModels set to true.' + + +@inherit_doc +class CrossValidatorReader(MLReader): + + def __init__(self, cls): + super(CrossValidatorReader, self).__init__() + self.cls = cls + + def load(self, path): + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + if not DefaultParamsReader.isPythonParamsInstance(metadata): + return JavaMLReader(self.cls).load(path) + else: + metadata, estimator, evaluator, estimatorParamMaps = \ + _ValidatorSharedReadWrite.load(path, self.sc, metadata) + cv = CrossValidator(estimator=estimator, + estimatorParamMaps=estimatorParamMaps, + evaluator=evaluator) + cv = cv._resetUid(metadata['uid']) + DefaultParamsReader.getAndSetParams(cv, metadata, skipParams=['estimatorParamMaps']) + return cv + + +@inherit_doc +class CrossValidatorWriter(MLWriter): + + def __init__(self, instance): + super(CrossValidatorWriter, self).__init__() + self.instance = instance + + def saveImpl(self, path): + _ValidatorSharedReadWrite.validateParams(self.instance) + _ValidatorSharedReadWrite.saveImpl(path, self.instance, self.sc) + + +@inherit_doc +class CrossValidatorModelReader(MLReader): + + def __init__(self, cls): + super(CrossValidatorModelReader, self).__init__() + self.cls = cls + + def load(self, path): + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + if not DefaultParamsReader.isPythonParamsInstance(metadata): + return JavaMLReader(self.cls).load(path) + else: + metadata, estimator, evaluator, estimatorParamMaps = \ + _ValidatorSharedReadWrite.load(path, self.sc, metadata) + numFolds = metadata['paramMap']['numFolds'] + bestModelPath = os.path.join(path, 'bestModel') + bestModel = DefaultParamsReader.loadParamsInstance(bestModelPath, self.sc) + avgMetrics = metadata['avgMetrics'] + persistSubModels = ('persistSubModels' in metadata) and metadata['persistSubModels'] + + if persistSubModels: + subModels = [[None] * len(estimatorParamMaps)] * numFolds + for splitIndex in range(numFolds): + for paramIndex in range(len(estimatorParamMaps)): + modelPath = os.path.join( + path, 'subModels', f'fold{splitIndex}', f'{paramIndex}') + subModels[splitIndex][paramIndex] = \ + DefaultParamsReader.loadParamsInstance(modelPath, self.sc) + else: + subModels = None + + cvModel = CrossValidatorModel(bestModel, 
avgMetrics=avgMetrics, subModels=subModels) + cvModel = cvModel._resetUid(metadata['uid']) + cvModel.set(cvModel.estimator, estimator) + cvModel.set(cvModel.estimatorParamMaps, estimatorParamMaps) + cvModel.set(cvModel.evaluator, evaluator) + DefaultParamsReader.getAndSetParams( + cvModel, metadata, skipParams=['estimatorParamMaps']) + return cvModel + + +@inherit_doc +class CrossValidatorModelWriter(MLWriter): + + def __init__(self, instance): + super(CrossValidatorModelWriter, self).__init__() + self.instance = instance + + def saveImpl(self, path): + _ValidatorSharedReadWrite.validateParams(self.instance) + instance = self.instance + persistSubModels = _ValidatorSharedReadWrite \ + .getValidatorModelWriterPersistSubModelsParam(self) + extraMetadata = {'avgMetrics': instance.avgMetrics, + 'persistSubModels': persistSubModels} + _ValidatorSharedReadWrite.saveImpl(path, instance, self.sc, extraMetadata=extraMetadata) + bestModelPath = os.path.join(path, 'bestModel') + instance.bestModel.save(bestModelPath) + if persistSubModels: + if instance.subModels is None: + raise ValueError(_save_with_persist_submodels_no_submodels_found_err) + subModelsPath = os.path.join(path, 'subModels') + for splitIndex in range(instance.getNumFolds()): + splitPath = os.path.join(subModelsPath, f'fold{splitIndex}') + for paramIndex in range(len(instance.getEstimatorParamMaps())): + modelPath = os.path.join(splitPath, f'{paramIndex}') + instance.subModels[splitIndex][paramIndex].save(modelPath) + class _CrossValidatorParams(_ValidatorParams): """ @@ -553,13 +770,15 @@ def copy(self, extra=None): @since("2.3.0") def write(self): """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) + if _ValidatorSharedReadWrite.is_java_convertible(self): + return JavaMLWriter(self) + return CrossValidatorWriter(self) @classmethod @since("2.3.0") def read(cls): """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) + return CrossValidatorReader(cls) @classmethod def _from_java(cls, java_stage): @@ -662,13 +881,15 @@ def copy(self, extra=None): @since("2.3.0") def write(self): """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) + if _ValidatorSharedReadWrite.is_java_convertible(self): + return JavaMLWriter(self) + return CrossValidatorModelWriter(self) @classmethod @since("2.3.0") def read(cls): """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) + return CrossValidatorModelReader(cls) @classmethod def _from_java(cls, java_stage): @@ -738,6 +959,106 @@ def _to_java(self): return _java_obj +@inherit_doc +class TrainValidationSplitReader(MLReader): + + def __init__(self, cls): + super(TrainValidationSplitReader, self).__init__() + self.cls = cls + + def load(self, path): + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + if not DefaultParamsReader.isPythonParamsInstance(metadata): + return JavaMLReader(self.cls).load(path) + else: + metadata, estimator, evaluator, estimatorParamMaps = \ + _ValidatorSharedReadWrite.load(path, self.sc, metadata) + tvs = TrainValidationSplit(estimator=estimator, + estimatorParamMaps=estimatorParamMaps, + evaluator=evaluator) + tvs = tvs._resetUid(metadata['uid']) + DefaultParamsReader.getAndSetParams(tvs, metadata, skipParams=['estimatorParamMaps']) + return tvs + + +@inherit_doc +class TrainValidationSplitWriter(MLWriter): + + def __init__(self, instance): + super(TrainValidationSplitWriter, self).__init__() + self.instance = instance + + def saveImpl(self, path): + 
_ValidatorSharedReadWrite.validateParams(self.instance) + _ValidatorSharedReadWrite.saveImpl(path, self.instance, self.sc) + + +@inherit_doc +class TrainValidationSplitModelReader(MLReader): + + def __init__(self, cls): + super(TrainValidationSplitModelReader, self).__init__() + self.cls = cls + + def load(self, path): + metadata = DefaultParamsReader.loadMetadata(path, self.sc) + if not DefaultParamsReader.isPythonParamsInstance(metadata): + return JavaMLReader(self.cls).load(path) + else: + metadata, estimator, evaluator, estimatorParamMaps = \ + _ValidatorSharedReadWrite.load(path, self.sc, metadata) + bestModelPath = os.path.join(path, 'bestModel') + bestModel = DefaultParamsReader.loadParamsInstance(bestModelPath, self.sc) + validationMetrics = metadata['validationMetrics'] + persistSubModels = ('persistSubModels' in metadata) and metadata['persistSubModels'] + + if persistSubModels: + subModels = [None] * len(estimatorParamMaps) + for paramIndex in range(len(estimatorParamMaps)): + modelPath = os.path.join(path, 'subModels', f'{paramIndex}') + subModels[paramIndex] = \ + DefaultParamsReader.loadParamsInstance(modelPath, self.sc) + else: + subModels = None + + tvsModel = TrainValidationSplitModel( + bestModel, validationMetrics=validationMetrics, subModels=subModels) + tvsModel = tvsModel._resetUid(metadata['uid']) + tvsModel.set(tvsModel.estimator, estimator) + tvsModel.set(tvsModel.estimatorParamMaps, estimatorParamMaps) + tvsModel.set(tvsModel.evaluator, evaluator) + DefaultParamsReader.getAndSetParams( + tvsModel, metadata, skipParams=['estimatorParamMaps']) + return tvsModel + + +@inherit_doc +class TrainValidationSplitModelWriter(MLWriter): + + def __init__(self, instance): + super(TrainValidationSplitModelWriter, self).__init__() + self.instance = instance + + def saveImpl(self, path): + _ValidatorSharedReadWrite.validateParams(self.instance) + instance = self.instance + persistSubModels = _ValidatorSharedReadWrite \ + .getValidatorModelWriterPersistSubModelsParam(self) + + extraMetadata = {'validationMetrics': instance.validationMetrics, + 'persistSubModels': persistSubModels} + _ValidatorSharedReadWrite.saveImpl(path, instance, self.sc, extraMetadata=extraMetadata) + bestModelPath = os.path.join(path, 'bestModel') + instance.bestModel.save(bestModelPath) + if persistSubModels: + if instance.subModels is None: + raise ValueError(_save_with_persist_submodels_no_submodels_found_err) + subModelsPath = os.path.join(path, 'subModels') + for paramIndex in range(len(instance.getEstimatorParamMaps())): + modelPath = os.path.join(subModelsPath, f'{paramIndex}') + instance.subModels[paramIndex].save(modelPath) + + class _TrainValidationSplitParams(_ValidatorParams): """ Params for :py:class:`TrainValidationSplit` and :py:class:`TrainValidationSplitModel`. 
@@ -942,13 +1263,15 @@ def copy(self, extra=None): @since("2.3.0") def write(self): """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) + if _ValidatorSharedReadWrite.is_java_convertible(self): + return JavaMLWriter(self) + return TrainValidationSplitWriter(self) @classmethod @since("2.3.0") def read(cls): """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) + return TrainValidationSplitReader(cls) @classmethod def _from_java(cls, java_stage): @@ -1046,13 +1369,15 @@ def copy(self, extra=None): @since("2.3.0") def write(self): """Returns an MLWriter instance for this ML instance.""" - return JavaMLWriter(self) + if _ValidatorSharedReadWrite.is_java_convertible(self): + return JavaMLWriter(self) + return TrainValidationSplitModelWriter(self) @classmethod @since("2.3.0") def read(cls): """Returns an MLReader instance for this class.""" - return JavaMLReader(cls) + return TrainValidationSplitModelReader(cls) @classmethod def _from_java(cls, java_stage): diff --git a/python/pyspark/ml/tuning.pyi b/python/pyspark/ml/tuning.pyi index 63cd75f0e1d74..e5f153d49e9c6 100644 --- a/python/pyspark/ml/tuning.pyi +++ b/python/pyspark/ml/tuning.pyi @@ -183,3 +183,43 @@ class TrainValidationSplitModel( def write(self) -> MLWriter: ... @classmethod def read(cls: Type[TrainValidationSplitModel]) -> MLReader: ... + +class CrossValidatorWriter(MLWriter): + instance: CrossValidator + def __init__(self, instance: CrossValidator) -> None: ... + def saveImpl(self, path: str) -> None: ... + +class CrossValidatorReader(MLReader[CrossValidator]): + cls: Type[CrossValidator] + def __init__(self, cls: Type[CrossValidator]) -> None: ... + def load(self, path: str) -> CrossValidator: ... + +class CrossValidatorModelWriter(MLWriter): + instance: CrossValidatorModel + def __init__(self, instance: CrossValidatorModel) -> None: ... + def saveImpl(self, path: str) -> None: ... + +class CrossValidatorModelReader(MLReader[CrossValidatorModel]): + cls: Type[CrossValidatorModel] + def __init__(self, cls: Type[CrossValidatorModel]) -> None: ... + def load(self, path: str) -> CrossValidatorModel: ... + +class TrainValidationSplitWriter(MLWriter): + instance: TrainValidationSplit + def __init__(self, instance: TrainValidationSplit) -> None: ... + def saveImpl(self, path: str) -> None: ... + +class TrainValidationSplitReader(MLReader[TrainValidationSplit]): + cls: Type[TrainValidationSplit] + def __init__(self, cls: Type[TrainValidationSplit]) -> None: ... + def load(self, path: str) -> TrainValidationSplit: ... + +class TrainValidationSplitModelWriter(MLWriter): + instance: TrainValidationSplitModel + def __init__(self, instance: TrainValidationSplitModel) -> None: ... + def saveImpl(self, path: str) -> None: ... + +class TrainValidationSplitModelReader(MLReader[TrainValidationSplitModel]): + cls: Type[TrainValidationSplitModel] + def __init__(self, cls: Type[TrainValidationSplitModel]) -> None: ... + def load(self, path: str) -> TrainValidationSplitModel: ... 
diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index a34bfb53482a0..156e7f0fe65e6 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -106,6 +106,7 @@ class MLWriter(BaseReadWrite): def __init__(self): super(MLWriter, self).__init__() self.shouldOverwrite = False + self.optionMap = {} def _handleOverwrite(self, path): from pyspark.ml.wrapper import JavaWrapper @@ -132,6 +133,14 @@ def overwrite(self): self.shouldOverwrite = True return self + def option(self, key, value): + """ + Adds an option to the underlying MLWriter. See the documentation for the specific model's + writer for possible options. The option name (key) is case-insensitive. + """ + self.optionMap[key.lower()] = str(value) + return self + @inherit_doc class GeneralMLWriter(MLWriter): @@ -375,6 +384,13 @@ def __init__(self, instance): def saveImpl(self, path): DefaultParamsWriter.saveMetadata(self.instance, path, self.sc) + @staticmethod + def extractJsonParams(instance, skipParams): + paramMap = instance.extractParamMap() + jsonParams = {param.name: value for param, value in paramMap.items() + if param.name not in skipParams} + return jsonParams + @staticmethod def saveMetadata(instance, path, sc, extraMetadata=None, paramMap=None): """ @@ -530,15 +546,16 @@ def _parseMetaData(metadataStr, expectedClassName=""): return metadata @staticmethod - def getAndSetParams(instance, metadata): + def getAndSetParams(instance, metadata, skipParams=None): """ Extract Params from metadata, and set them in the instance. """ # Set user-supplied param values for paramName in metadata['paramMap']: param = instance.getParam(paramName) - paramValue = metadata['paramMap'][paramName] - instance.set(param, paramValue) + if skipParams is None or paramName not in skipParams: + paramValue = metadata['paramMap'][paramName] + instance.set(param, paramValue) # Set default param values majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata['sparkVersion']) @@ -554,6 +571,10 @@ def getAndSetParams(instance, metadata): paramValue = metadata['defaultParamMap'][paramName] instance._setDefault(**{paramName: paramValue}) + @staticmethod + def isPythonParamsInstance(metadata): + return metadata['class'].startswith('pyspark.ml.') + @staticmethod def loadParamsInstance(path, sc): """ @@ -561,7 +582,10 @@ def loadParamsInstance(path, sc): This assumes the instance inherits from :py:class:`MLReadable`. """ metadata = DefaultParamsReader.loadMetadata(path, sc) - pythonClassName = metadata['class'].replace("org.apache.spark", "pyspark") + if DefaultParamsReader.isPythonParamsInstance(metadata): + pythonClassName = metadata['class'] + else: + pythonClassName = metadata['class'].replace("org.apache.spark", "pyspark") py_type = DefaultParamsReader.__get_class(pythonClassName) instance = py_type.load(path) return instance @@ -630,3 +654,13 @@ def getAllNestedStages(pyInstance): nestedStages.extend(MetaAlgorithmReadWrite.getAllNestedStages(pySubStage)) return [pyInstance] + nestedStages + + @staticmethod + def getUidMap(instance): + nestedStages = MetaAlgorithmReadWrite.getAllNestedStages(instance) + uidMap = {stage.uid: stage for stage in nestedStages} + if len(nestedStages) != len(uidMap): + raise RuntimeError(f'{instance.__class__.__module__}.{instance.__class__.__name__}' + f'.load found a compound estimator with stages with duplicate ' + f'UIDs. 
List of UIDs: {list(uidMap.keys())}.') + return uidMap diff --git a/python/pyspark/ml/util.pyi b/python/pyspark/ml/util.pyi index e2496e181f14f..db28c095a5568 100644 --- a/python/pyspark/ml/util.pyi +++ b/python/pyspark/ml/util.pyi @@ -132,3 +132,5 @@ class MetaAlgorithmReadWrite: def isMetaEstimator(pyInstance: Any) -> bool: ... @staticmethod def getAllNestedStages(pyInstance: Any) -> list: ... + @staticmethod + def getUidMap(instance: Any) -> dict: ... diff --git a/python/pyspark/testing/mlutils.py b/python/pyspark/testing/mlutils.py index a90a64e747dea..d6edf9d64af49 100644 --- a/python/pyspark/testing/mlutils.py +++ b/python/pyspark/testing/mlutils.py @@ -17,8 +17,12 @@ import numpy as np +from pyspark import keyword_only from pyspark.ml import Estimator, Model, Transformer, UnaryTransformer +from pyspark.ml.evaluation import Evaluator from pyspark.ml.param import Param, Params, TypeConverters +from pyspark.ml.param.shared import HasMaxIter, HasRegParam +from pyspark.ml.classification import Classifier, ClassificationModel from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable from pyspark.ml.wrapper import _java2py # type: ignore from pyspark.sql import DataFrame, SparkSession @@ -161,3 +165,86 @@ def _fit(self, dataset): class MockModel(MockTransformer, Model, HasFake): pass + + +class _DummyLogisticRegressionParams(HasMaxIter, HasRegParam): + def setMaxIter(self, value): + return self._set(maxIter=value) + + def setRegParam(self, value): + return self._set(regParam=value) + + +# This is a dummy LogisticRegression used in test for python backend estimator/model +class DummyLogisticRegression(Classifier, _DummyLogisticRegressionParams, + DefaultParamsReadable, DefaultParamsWritable): + @keyword_only + def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, regParam=0.0, rawPredictionCol="rawPrediction"): + super(DummyLogisticRegression, self).__init__() + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, regParam=0.0, rawPredictionCol="rawPrediction"): + kwargs = self._input_kwargs + self._set(**kwargs) + return self + + def _fit(self, dataset): + # Do nothing but create a dummy model + return self._copyValues(DummyLogisticRegressionModel()) + + +class DummyLogisticRegressionModel(ClassificationModel, _DummyLogisticRegressionParams, + DefaultParamsReadable, DefaultParamsWritable): + + def __init__(self): + super(DummyLogisticRegressionModel, self).__init__() + + def _transform(self, dataset): + # A dummy transform impl which always predict label 1 + from pyspark.sql.functions import array, lit + from pyspark.ml.functions import array_to_vector + rawPredCol = self.getRawPredictionCol() + if rawPredCol: + dataset = dataset.withColumn( + rawPredCol, array_to_vector(array(lit(-100.0), lit(100.0)))) + predCol = self.getPredictionCol() + if predCol: + dataset = dataset.withColumn(predCol, lit(1.0)) + + return dataset + + @property + def numClasses(self): + # a dummy implementation for test. + return 2 + + @property + def intercept(self): + # a dummy implementation for test. + return 0.0 + + # This class only used in test. The following methods/properties are not used in tests. 
+ + @property + def coefficients(self): + raise NotImplementedError() + + def predictRaw(self, value): + raise NotImplementedError() + + def numFeatures(self): + raise NotImplementedError() + + def predict(self, value): + raise NotImplementedError() + + +class DummyEvaluator(Evaluator, DefaultParamsReadable, DefaultParamsWritable): + + def _evaluate(self, dataset): + # a dummy implementation for test. + return 1.0 From 85949588b71ed548a2e10d2e58183d9cce313a48 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Thu, 3 Dec 2020 16:43:15 -0800 Subject: [PATCH 096/150] [SPARK-33650][SQL] Fix the error from ALTER TABLE .. ADD/DROP PARTITION for non-supported partition management table ### What changes were proposed in this pull request? In the PR, I propose to change the order of post-analysis checks for the `ALTER TABLE .. ADD/DROP PARTITION` command, and perform the general check (does the table support partition management at all) before specific checks. ### Why are the changes needed? The error message for the table which doesn't support partition management can mislead users: ```java PartitionSpecs are not resolved;; 'AlterTableAddPartition [UnresolvedPartitionSpec(Map(id -> 1),None)], false +- ResolvedTable org.apache.spark.sql.connector.InMemoryTableCatalog2fd64b11, ns1.ns2.tbl, org.apache.spark.sql.connector.InMemoryTable5d3ff859 ``` because it says nothing about the root cause of the issue. ### Does this PR introduce _any_ user-facing change? Yes. After the change, the error message will be: ``` Table ns1.ns2.tbl can not alter partitions ``` ### How was this patch tested? By running the affected test suite `AlterTablePartitionV2SQLSuite`. Closes #30594 from MaxGekk/check-order-AlterTablePartition. Authored-by: Max Gekk Signed-off-by: Dongjoon Hyun --- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 +++--- .../AlterTablePartitionV2SQLSuite.scala | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 61ac6346ff944..64496a953861a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -996,12 +996,12 @@ trait CheckAnalysis extends PredicateHelper { private def checkAlterTablePartition( table: Table, parts: Seq[PartitionSpec]): Unit = { (table, parts) match { - case (_, parts) if parts.exists(_.isInstanceOf[UnresolvedPartitionSpec]) => - failAnalysis("PartitionSpecs are not resolved") - case (table, _) if !table.isInstanceOf[SupportsPartitionManagement] => failAnalysis(s"Table ${table.name()} can not alter partitions.") + case (_, parts) if parts.exists(_.isInstanceOf[UnresolvedPartitionSpec]) => + failAnalysis("PartitionSpecs are not resolved") + // Skip atomic partition tables case (_: SupportsAtomicPartitionManagement, _) => case (_: SupportsPartitionManagement, parts) if parts.size > 1 => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala index 3583eceec7559..47b5e5e54edde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala @@ -245,4 +245,20 @@ class 
AlterTablePartitionV2SQLSuite extends DatasourceV2SQLBase { assert(!partTable.partitionExists(expectedPartition)) } } + + test("SPARK-33650: add/drop partition into a table which doesn't support partition management") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING _") + Seq( + s"ALTER TABLE $t ADD PARTITION (id=1)", + s"ALTER TABLE $t DROP PARTITION (id=1)" + ).foreach { alterTable => + val errMsg = intercept[AnalysisException] { + spark.sql(alterTable) + }.getMessage + assert(errMsg.contains(s"Table $t can not alter partitions")) + } + } + } } From 29e415deac3c90936dd1466eab6b001b7f1f4959 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Fri, 4 Dec 2020 10:58:41 +0800 Subject: [PATCH 097/150] [SPARK-33649][SQL][DOC] Improve the doc of spark.sql.ansi.enabled ### What changes were proposed in this pull request? Improve the documentation of SQL configuration `spark.sql.ansi.enabled` ### Why are the changes needed? As there are more and more new features under the SQL configuration `spark.sql.ansi.enabled`, we should make it more clear about: 1. what exactly it is 2. where can users find all the features of the ANSI mode 3. whether all the features are exactly from the SQL standard ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? It's just doc change. Closes #30593 from gengliangwang/reviseAnsiDoc. Authored-by: Gengliang Wang Signed-off-by: Gengliang Wang --- docs/sql-ref-ansi-compliance.md | 3 ++- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 4e19799ca75b9..c13ea2b167d93 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -21,7 +21,8 @@ license: | Since Spark 3.0, Spark SQL introduces two experimental options to comply with the SQL standard: `spark.sql.ansi.enabled` and `spark.sql.storeAssignmentPolicy` (See a table below for details). -When `spark.sql.ansi.enabled` is set to `true`, Spark SQL follows the standard in basic behaviours (e.g., arithmetic operations, type conversion, SQL functions and SQL parsing). +When `spark.sql.ansi.enabled` is set to `true`, Spark SQL uses an ANSI compliant dialect instead of being Hive compliant. For example, Spark will throw an exception at runtime instead of returning null results if the inputs to a SQL operator/function are invalid. Some ANSI dialect features may be not from the ANSI SQL standard directly, but their behaviors align with ANSI SQL's style. + Moreover, Spark SQL has an independent option to control implicit casting behaviours when inserting rows in a table. The casting behaviours are defined as store assignment rules in the standard. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b32476a5af71a..07cd41b06de21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2209,11 +2209,12 @@ object SQLConf { .createWithDefault(StoreAssignmentPolicy.ANSI.toString) val ANSI_ENABLED = buildConf("spark.sql.ansi.enabled") - .doc("When true, Spark tries to conform to the ANSI SQL specification: 1. Spark will " + - "throw an exception at runtime if the inputs to a SQL operator/function are invalid, " + - "e.g. 
overflow in arithmetic operations, out-of-range index when accessing array elements. " + - "2. Spark will forbid using the reserved keywords of ANSI SQL as identifiers in " + - "the SQL parser. 3. Spark will return NULL for null input for function `size`.") + .doc("When true, Spark SQL uses an ANSI compliant dialect instead of being Hive compliant. " + + "For example, Spark will throw an exception at runtime instead of returning null results " + + "when the inputs to a SQL operator/function are invalid." + + "For full details of this dialect, you can find them in the section \"ANSI Compliance\" of " + + "Spark's documentation. Some ANSI dialect features may be not from the ANSI SQL " + + "standard directly, but their behaviors align with ANSI SQL's style") .version("3.0.0") .booleanConf .createWithDefault(false) From e22ddb6740e73a5d1b4ef1ddd21e4241bf85f03c Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 4 Dec 2020 05:43:05 +0000 Subject: [PATCH 098/150] [SPARK-32405][SQL][FOLLOWUP] Remove USING _ in CREATE TABLE in JDBCTableCatalog docker tests ### What changes were proposed in this pull request? remove USING _ in CREATE TABLE in JDBCTableCatalog docker tests ### Why are the changes needed? Previously CREATE TABLE syntax forces users to specify a provider so we have to add a USING _ . Now the problem was fix and we need to remove it. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests Closes #30599 from huaxingao/remove_USING. Authored-by: Huaxin Gao Signed-off-by: Wenchen Fan --- .../spark/sql/jdbc/v2/DB2IntegrationSuite.scala | 4 ++-- .../sql/jdbc/v2/MsSqlServerIntegrationSuite.scala | 4 ++-- .../spark/sql/jdbc/v2/MySQLIntegrationSuite.scala | 6 +++--- .../spark/sql/jdbc/v2/OracleIntegrationSuite.scala | 2 +- .../spark/sql/jdbc/v2/PostgresIntegrationSuite.scala | 4 ++-- .../org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala | 12 ++++++------ 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala index 4b6461815d306..6f803b8f61dd4 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala @@ -59,7 +59,7 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest { override def dataPreparation(conn: Connection): Unit = {} override def testUpdateColumnType(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INTEGER) USING _") + sql(s"CREATE TABLE $tbl (ID INTEGER)") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) assert(t.schema === expectedSchema) @@ -75,7 +75,7 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest { } override def testCreateTableWithProperty(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INT) USING _" + + sql(s"CREATE TABLE $tbl (ID INT)" + s" TBLPROPERTIES('CCSID'='UNICODE')") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala index fd101607ad3ee..a7e257dbdc554 100644 --- 
a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala @@ -65,7 +65,7 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBC override def notSupportsTableComment: Boolean = true override def testUpdateColumnType(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INTEGER) USING _") + sql(s"CREATE TABLE $tbl (ID INTEGER)") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) assert(t.schema === expectedSchema) @@ -81,7 +81,7 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBC } override def testUpdateColumnNullability(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID STRING NOT NULL) USING _") + sql(s"CREATE TABLE $tbl (ID STRING NOT NULL)") // Update nullability is unsupported for mssql db. val msg = intercept[AnalysisException] { sql(s"ALTER TABLE $tbl ALTER COLUMN ID DROP NOT NULL") diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala index a81399fc2a4f7..5f63fde7a0f58 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala @@ -67,7 +67,7 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest { } override def testUpdateColumnType(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INTEGER) USING _") + sql(s"CREATE TABLE $tbl (ID INTEGER)") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) assert(t.schema === expectedSchema) @@ -98,7 +98,7 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest { } override def testUpdateColumnNullability(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID STRING NOT NULL) USING _") + sql(s"CREATE TABLE $tbl (ID STRING NOT NULL)") // Update nullability is unsupported for mysql db. 
val msg = intercept[AnalysisException] { sql(s"ALTER TABLE $tbl ALTER COLUMN ID DROP NOT NULL") @@ -108,7 +108,7 @@ class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest { } override def testCreateTableWithProperty(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INT) USING _" + + sql(s"CREATE TABLE $tbl (ID INT)" + s" TBLPROPERTIES('ENGINE'='InnoDB', 'DEFAULT CHARACTER SET'='utf8')") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala index 403f16aac6356..241c9c1409550 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala @@ -73,7 +73,7 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest override def dataPreparation(conn: Connection): Unit = {} override def testUpdateColumnType(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INTEGER) USING _") + sql(s"CREATE TABLE $tbl (ID INTEGER)") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", DecimalType(10, 0)) assert(t.schema === expectedSchema) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala index df2c865e4d13b..a7fd9aa9a9868 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala @@ -52,7 +52,7 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTes override def dataPreparation(conn: Connection): Unit = {} override def testUpdateColumnType(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INTEGER) USING _") + sql(s"CREATE TABLE $tbl (ID INTEGER)") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) assert(t.schema === expectedSchema) @@ -68,7 +68,7 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTes } override def testCreateTableWithProperty(tbl: String): Unit = { - sql(s"CREATE TABLE $tbl (ID INT) USING _" + + sql(s"CREATE TABLE $tbl (ID INT)" + s" TBLPROPERTIES('TABLESPACE'='pg_default')") var t = spark.table(tbl) var expectedSchema = new StructType().add("ID", IntegerType) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala index e36555e514c9f..a2dd8375834bf 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala @@ -33,7 +33,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession { def notSupportsTableComment: Boolean = false def testUpdateColumnNullability(tbl: String): Unit = { - sql(s"CREATE TABLE $catalogName.alt_table (ID STRING NOT NULL) USING _") + sql(s"CREATE TABLE $catalogName.alt_table (ID STRING NOT NULL)") var t = 
spark.table(s"$catalogName.alt_table") // nullable is true in the expectedSchema because Spark always sets nullable to true // regardless of the JDBC metadata https://github.com/apache/spark/pull/18445 @@ -62,7 +62,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession { test("SPARK-33034: ALTER TABLE ... add new columns") { withTable(s"$catalogName.alt_table") { - sql(s"CREATE TABLE $catalogName.alt_table (ID STRING) USING _") + sql(s"CREATE TABLE $catalogName.alt_table (ID STRING)") var t = spark.table(s"$catalogName.alt_table") var expectedSchema = new StructType().add("ID", StringType) assert(t.schema === expectedSchema) @@ -89,7 +89,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession { test("SPARK-33034: ALTER TABLE ... drop column") { withTable(s"$catalogName.alt_table") { - sql(s"CREATE TABLE $catalogName.alt_table (C1 INTEGER, C2 STRING, c3 INTEGER) USING _") + sql(s"CREATE TABLE $catalogName.alt_table (C1 INTEGER, C2 STRING, c3 INTEGER)") sql(s"ALTER TABLE $catalogName.alt_table DROP COLUMN C1") sql(s"ALTER TABLE $catalogName.alt_table DROP COLUMN c3") val t = spark.table(s"$catalogName.alt_table") @@ -127,7 +127,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession { test("SPARK-33034: ALTER TABLE ... rename column") { withTable(s"$catalogName.alt_table") { sql(s"CREATE TABLE $catalogName.alt_table (ID STRING NOT NULL," + - s" ID1 STRING NOT NULL, ID2 STRING NOT NULL) USING _") + s" ID1 STRING NOT NULL, ID2 STRING NOT NULL)") testRenameColumn(s"$catalogName.alt_table") // Rename to already existing column val msg = intercept[AnalysisException] { @@ -157,7 +157,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession { withTable(s"$catalogName.new_table") { val logAppender = new LogAppender("table comment") withLogAppender(logAppender) { - sql(s"CREATE TABLE $catalogName.new_table(i INT) USING _ COMMENT 'this is a comment'") + sql(s"CREATE TABLE $catalogName.new_table(i INT) COMMENT 'this is a comment'") } val createCommentWarning = logAppender.loggingEvents .filter(_.getLevel == Level.WARN) @@ -170,7 +170,7 @@ private[v2] trait V2JDBCTest extends SharedSparkSession { test("CREATE TABLE with table property") { withTable(s"$catalogName.new_table") { val m = intercept[AnalysisException] { - sql(s"CREATE TABLE $catalogName.new_table (i INT) USING _ TBLPROPERTIES('a'='1')") + sql(s"CREATE TABLE $catalogName.new_table (i INT) TBLPROPERTIES('a'='1')") }.message assert(m.contains("Failed table creation")) testCreateTableWithProperty(s"$catalogName.new_table") From e02324f2dda3510dd229199e97c87ffdcc766a18 Mon Sep 17 00:00:00 2001 From: Linhong Liu Date: Fri, 4 Dec 2020 06:48:49 +0000 Subject: [PATCH 099/150] [SPARK-33142][SPARK-33647][SQL] Store SQL text for SQL temp view ### What changes were proposed in this pull request? Currently, in spark, the temp view is saved as its analyzed logical plan, while the permanent view is kept in HMS with its origin SQL text. As a result, permanent and temporary views have different behaviors in some cases. In this PR we store the SQL text for temporary view in order to unify the behavior between permanent and temporary views. ### Why are the changes needed? to unify the behavior between permanent and temporary views ### Does this PR introduce _any_ user-facing change? Yes, with this PR, the temporary view will be re-analyzed when it's referred. So if the underlying datasource changed, the view will also be updated. ### How was this patch tested? existing and newly added test cases Closes #30567 from linhongliu-db/SPARK-33142. 
Authored-by: Linhong Liu Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 43 ++-- .../sql/catalyst/analysis/CheckAnalysis.scala | 2 +- .../spark/sql/catalyst/analysis/view.scala | 4 +- .../sql/catalyst/catalog/SessionCatalog.scala | 57 +++-- .../sql/catalyst/catalog/interface.scala | 48 ++++ .../plans/logical/basicLogicalOperators.scala | 42 ++- .../apache/spark/sql/internal/SQLConf.scala | 11 + .../sql/catalyst/analysis/AnalysisSuite.scala | 1 + .../catalog/SessionCatalogSuite.scala | 4 +- .../command/AnalyzeColumnCommand.scala | 5 +- .../spark/sql/execution/command/views.scala | 239 ++++++++++++++---- .../sql-tests/results/describe.sql.out | 4 +- .../sql-tests/results/group-by-filter.sql.out | 56 ++-- .../results/postgreSQL/create_view.sql.out | 28 +- .../results/show-tblproperties.sql.out | 2 + .../invalid-correlation.sql.out | 7 +- .../apache/spark/sql/CachedTableSuite.scala | 22 -- .../spark/sql/execution/SQLViewSuite.scala | 84 ++++++ .../sql/execution/SQLViewTestSuite.scala | 203 +++++++++++++++ .../SparkGetColumnsOperation.scala | 2 +- 20 files changed, 691 insertions(+), 173 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6b06cf13262d4..ebe1004872ef6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -105,7 +105,8 @@ object FakeV2SessionCatalog extends TableCatalog { case class AnalysisContext( catalogAndNamespace: Seq[String] = Nil, nestedViewDepth: Int = 0, - relationCache: mutable.Map[Seq[String], LogicalPlan] = mutable.Map.empty) + relationCache: mutable.Map[Seq[String], LogicalPlan] = mutable.Map.empty, + referredTempViewNames: Seq[Seq[String]] = Seq.empty) object AnalysisContext { private val value = new ThreadLocal[AnalysisContext]() { @@ -117,10 +118,14 @@ object AnalysisContext { private def set(context: AnalysisContext): Unit = value.set(context) - def withAnalysisContext[A](catalogAndNamespace: Seq[String])(f: => A): A = { + def withAnalysisContext[A]( + catalogAndNamespace: Seq[String], referredTempViewNames: Seq[Seq[String]])(f: => A): A = { val originContext = value.get() val context = AnalysisContext( - catalogAndNamespace, originContext.nestedViewDepth + 1, originContext.relationCache) + catalogAndNamespace, + originContext.nestedViewDepth + 1, + originContext.relationCache, + referredTempViewNames) set(context) try f finally { set(originContext) } } @@ -838,6 +843,7 @@ class Analyzer(override val catalogManager: CatalogManager) } private def isResolvingView: Boolean = AnalysisContext.get.catalogAndNamespace.nonEmpty + private def referredTempViewNames: Seq[Seq[String]] = AnalysisContext.get.referredTempViewNames /** * Resolve relations to temp views. This is not an actual rule, and is called by @@ -882,7 +888,7 @@ class Analyzer(override val catalogManager: CatalogManager) def lookupTempView( identifier: Seq[String], isStreaming: Boolean = false): Option[LogicalPlan] = { // Permanent View can't refer to temp views, no need to lookup at all. 
- if (isResolvingView) return None + if (isResolvingView && !referredTempViewNames.contains(identifier)) return None val tmpView = identifier match { case Seq(part1) => v1SessionCatalog.lookupTempView(part1) @@ -894,14 +900,14 @@ class Analyzer(override val catalogManager: CatalogManager) throw new AnalysisException(s"${identifier.quoted} is not a temp view of streaming " + s"logical plan, please use batch API such as `DataFrameReader.table` to read it.") } - tmpView + tmpView.map(ResolveRelations.resolveViews) } } // If we are resolving relations insides views, we need to expand single-part relation names with // the current catalog and namespace of when the view was created. private def expandRelationName(nameParts: Seq[String]): Seq[String] = { - if (!isResolvingView) return nameParts + if (!isResolvingView || referredTempViewNames.contains(nameParts)) return nameParts if (nameParts.length == 1) { AnalysisContext.get.catalogAndNamespace :+ nameParts.head @@ -1022,23 +1028,24 @@ class Analyzer(override val catalogManager: CatalogManager) // look at `AnalysisContext.catalogAndNamespace` when resolving relations with single-part name. // If `AnalysisContext.catalogAndNamespace` is non-empty, analyzer will expand single-part names // with it, instead of current catalog and namespace. - private def resolveViews(plan: LogicalPlan): LogicalPlan = plan match { + def resolveViews(plan: LogicalPlan): LogicalPlan = plan match { // The view's child should be a logical plan parsed from the `desc.viewText`, the variable // `viewText` should be defined, or else we throw an error on the generation of the View // operator. - case view @ View(desc, _, child) if !child.resolved => + case view @ View(desc, isTempView, _, child) if !child.resolved => // Resolve all the UnresolvedRelations and Views in the child. - val newChild = AnalysisContext.withAnalysisContext(desc.viewCatalogAndNamespace) { - if (AnalysisContext.get.nestedViewDepth > conf.maxNestedViewDepth) { - view.failAnalysis(s"The depth of view ${desc.identifier} exceeds the maximum " + - s"view resolution depth (${conf.maxNestedViewDepth}). Analysis is aborted to " + - s"avoid errors. Increase the value of ${SQLConf.MAX_NESTED_VIEW_DEPTH.key} to work " + - "around this.") - } - SQLConf.withExistingConf(View.effectiveSQLConf(desc.viewSQLConfigs)) { - executeSameContext(child) + val newChild = AnalysisContext.withAnalysisContext( + desc.viewCatalogAndNamespace, desc.viewReferredTempViewNames) { + if (AnalysisContext.get.nestedViewDepth > conf.maxNestedViewDepth) { + view.failAnalysis(s"The depth of view ${desc.identifier} exceeds the maximum " + + s"view resolution depth (${conf.maxNestedViewDepth}). Analysis is aborted to " + + s"avoid errors. 
Increase the value of ${SQLConf.MAX_NESTED_VIEW_DEPTH.key} to " + + "work around this.") + } + SQLConf.withExistingConf(View.effectiveSQLConf(desc.viewSQLConfigs, isTempView)) { + executeSameContext(child) + } } - } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => p.copy(child = resolveViews(view)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 64496a953861a..11c4883992560 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -407,7 +407,7 @@ trait CheckAnalysis extends PredicateHelper { // output, nor with the query column names, throw an AnalysisException. // If the view's child output can't up cast to the view output, // throw an AnalysisException, too. - case v @ View(desc, output, child) if child.resolved && !v.sameOutput(child) => + case v @ View(desc, _, output, child) if child.resolved && !v.sameOutput(child) => val queryColumnNames = desc.viewQueryColumnNames val queryOutput = if (queryColumnNames.nonEmpty) { if (output.length != queryColumnNames.length) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala index 06de023098a1c..dfadf0a539948 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala @@ -56,7 +56,7 @@ object EliminateView extends Rule[LogicalPlan] with CastSupport { override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { // The child has the different output attributes with the View operator. Adds a Project over // the child of the view. - case v @ View(desc, output, child) if child.resolved && !v.sameOutput(child) => + case v @ View(desc, _, output, child) if child.resolved && !v.sameOutput(child) => val resolver = conf.resolver val queryColumnNames = desc.viewQueryColumnNames val queryOutput = if (queryColumnNames.nonEmpty) { @@ -83,7 +83,7 @@ object EliminateView extends Rule[LogicalPlan] with CastSupport { // The child should have the same output attributes with the View operator, so we simply // remove the View operator. - case View(_, _, child) => + case View(_, _, _, child) => child } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 01bce079610ae..29481b85e9f2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -606,7 +606,7 @@ class SessionCatalog( * Return a local temporary view exactly as it was stored. */ def getTempView(name: String): Option[LogicalPlan] = synchronized { - tempViews.get(formatTableName(name)) + tempViews.get(formatTableName(name)).map(getTempViewPlan) } def getTempViewNames(): Seq[String] = synchronized { @@ -617,7 +617,7 @@ class SessionCatalog( * Return a global temporary view exactly as it was stored. 
*/ def getGlobalTempView(name: String): Option[LogicalPlan] = { - globalTempViewManager.get(formatTableName(name)) + globalTempViewManager.get(formatTableName(name)).map(getTempViewPlan) } /** @@ -654,20 +654,25 @@ class SessionCatalog( def getTempViewOrPermanentTableMetadata(name: TableIdentifier): CatalogTable = synchronized { val table = formatTableName(name.table) if (name.database.isEmpty) { - getTempView(table).map { plan => - CatalogTable( - identifier = TableIdentifier(table), - tableType = CatalogTableType.VIEW, - storage = CatalogStorageFormat.empty, - schema = plan.output.toStructType) + getTempView(table).map { + case TemporaryViewRelation(metadata) => metadata + case plan => + CatalogTable( + identifier = TableIdentifier(table), + tableType = CatalogTableType.VIEW, + storage = CatalogStorageFormat.empty, + schema = plan.output.toStructType) }.getOrElse(getTableMetadata(name)) } else if (formatDatabaseName(name.database.get) == globalTempViewManager.database) { - globalTempViewManager.get(table).map { plan => - CatalogTable( - identifier = TableIdentifier(table, Some(globalTempViewManager.database)), - tableType = CatalogTableType.VIEW, - storage = CatalogStorageFormat.empty, - schema = plan.output.toStructType) + val a = globalTempViewManager.get(table) + globalTempViewManager.get(table).map { + case TemporaryViewRelation(metadata) => metadata + case plan => + CatalogTable( + identifier = TableIdentifier(table, Some(globalTempViewManager.database)), + tableType = CatalogTableType.VIEW, + storage = CatalogStorageFormat.empty, + schema = plan.output.toStructType) }.getOrElse(throw new NoSuchTableException(globalTempViewManager.database, table)) } else { getTableMetadata(name) @@ -777,13 +782,13 @@ class SessionCatalog( val table = formatTableName(name.table) if (db == globalTempViewManager.database) { globalTempViewManager.get(table).map { viewDef => - SubqueryAlias(table, db, viewDef) + SubqueryAlias(table, db, getTempViewPlan(viewDef)) }.getOrElse(throw new NoSuchTableException(db, table)) } else if (name.database.isDefined || !tempViews.contains(table)) { val metadata = externalCatalog.getTable(db, table) getRelation(metadata) } else { - SubqueryAlias(table, tempViews(table)) + SubqueryAlias(table, getTempViewPlan(tempViews(table))) } } } @@ -797,26 +802,24 @@ class SessionCatalog( val multiParts = Seq(CatalogManager.SESSION_CATALOG_NAME, db, table) if (metadata.tableType == CatalogTableType.VIEW) { - val viewText = metadata.viewText.getOrElse(sys.error("Invalid view without text.")) - val viewConfigs = metadata.viewSQLConfigs - val viewPlan = SQLConf.withExistingConf(View.effectiveSQLConf(viewConfigs)) { - parser.parsePlan(viewText) - } - - logDebug(s"'$viewText' will be used for the view($table) with configs: $viewConfigs.") // The relation is a view, so we wrap the relation by: // 1. Add a [[View]] operator over the relation to keep track of the view desc; // 2. Wrap the logical plan in a [[SubqueryAlias]] which tracks the name of the view. 
- val child = View( - desc = metadata, - output = metadata.schema.toAttributes, - child = viewPlan) + val child = View.fromCatalogTable(metadata, isTempView = false, parser) SubqueryAlias(multiParts, child) } else { SubqueryAlias(multiParts, UnresolvedCatalogRelation(metadata, options)) } } + def getTempViewPlan(plan: LogicalPlan): LogicalPlan = { + plan match { + case viewInfo: TemporaryViewRelation => + View.fromCatalogTable(viewInfo.tableMeta, isTempView = true, parser) + case v => v + } + } + def lookupTempView(table: String): Option[SubqueryAlias] = { val formattedTable = formatTableName(table) getTempView(formattedTable).map { view => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 621ad84f1f5ec..6743b052fb3a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -25,6 +25,8 @@ import scala.collection.mutable import scala.util.control.NonFatal import org.apache.commons.lang3.StringUtils +import org.json4s.JsonAST.{JArray, JString} +import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException @@ -337,6 +339,40 @@ case class CatalogTable( ) } + /** + * Return temporary view names the current view was referred. should be empty if the + * CatalogTable is not a Temporary View or created by older versions of Spark(before 3.1.0). + */ + def viewReferredTempViewNames: Seq[Seq[String]] = { + try { + properties.get(VIEW_REFERRED_TEMP_VIEW_NAMES).map { json => + parse(json).asInstanceOf[JArray].arr.map { namePartsJson => + namePartsJson.asInstanceOf[JArray].arr.map(_.asInstanceOf[JString].s) + } + }.getOrElse(Seq.empty) + } catch { + case e: Exception => + throw new AnalysisException( + "corrupted view referred temp view names in catalog", cause = Some(e)) + } + } + + /** + * Return temporary function names the current view was referred. should be empty if the + * CatalogTable is not a Temporary View or created by older versions of Spark(before 3.1.0). + */ + def viewReferredTempFunctionNames: Seq[String] = { + try { + properties.get(VIEW_REFERRED_TEMP_FUNCTION_NAMES).map { json => + parse(json).asInstanceOf[JArray].arr.map(_.asInstanceOf[JString].s) + }.getOrElse(Seq.empty) + } catch { + case e: Exception => + throw new AnalysisException( + "corrupted view referred temp functions names in catalog", cause = Some(e)) + } + } + /** Syntactic sugar to update a field in `storage`. */ def withNewStorage( locationUri: Option[URI] = storage.locationUri, @@ -432,6 +468,9 @@ object CatalogTable { val VIEW_QUERY_OUTPUT_PREFIX = VIEW_PREFIX + "query.out." val VIEW_QUERY_OUTPUT_NUM_COLUMNS = VIEW_QUERY_OUTPUT_PREFIX + "numCols" val VIEW_QUERY_OUTPUT_COLUMN_NAME_PREFIX = VIEW_QUERY_OUTPUT_PREFIX + "col." 
+ + val VIEW_REFERRED_TEMP_VIEW_NAMES = VIEW_PREFIX + "referredTempViewNames" + val VIEW_REFERRED_TEMP_FUNCTION_NAMES = VIEW_PREFIX + "referredTempFunctionsNames" } /** @@ -667,6 +706,15 @@ case class UnresolvedCatalogRelation( override def output: Seq[Attribute] = Nil } +/** + * A wrapper to store the temporary view info, will be kept in `SessionCatalog` + * and will be transformed to `View` during analysis + */ +case class TemporaryViewRelation(tableMeta: CatalogTable) extends LeafNode { + override lazy val resolved: Boolean = false + override def output: Seq[Attribute] = Nil +} + /** * A `LogicalPlan` that represents a hive table. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index a524ed4ff73e9..c8b7e8651686a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -18,10 +18,11 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.AliasIdentifier -import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation +import org.apache.spark.sql.catalyst.analysis.{EliminateView, MultiInstanceRelation} import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} import org.apache.spark.sql.catalyst.util.truncatedString @@ -437,6 +438,7 @@ case class InsertIntoDir( */ case class View( desc: CatalogTable, + isTempView: Boolean, output: Seq[Attribute], child: LogicalPlan) extends LogicalPlan with MultiInstanceRelation { @@ -451,12 +453,31 @@ case class View( override def simpleString(maxFields: Int): String = { s"View (${desc.identifier}, ${output.mkString("[", ",", "]")})" } + + override def doCanonicalize(): LogicalPlan = { + def sameOutput( + outerProject: Seq[NamedExpression], innerProject: Seq[NamedExpression]): Boolean = { + outerProject.length == innerProject.length && + outerProject.zip(innerProject).forall { + case(outer, inner) => outer.name == inner.name && outer.dataType == inner.dataType + } + } + + val eliminated = EliminateView(this) match { + case Project(viewProjectList, child @ Project(queryProjectList, _)) + if sameOutput(viewProjectList, queryProjectList) => + child + case other => other + } + eliminated.canonicalized + } } object View { - def effectiveSQLConf(configs: Map[String, String]): SQLConf = { + def effectiveSQLConf(configs: Map[String, String], isTempView: Boolean): SQLConf = { val activeConf = SQLConf.get - if (activeConf.useCurrentSQLConfigsForView) return activeConf + // For temporary view, we always use captured sql configs + if (activeConf.useCurrentSQLConfigsForView && !isTempView) return activeConf val sqlConf = new SQLConf() for ((k, v) <- configs) { @@ -467,6 +488,21 @@ object View { sqlConf.setConf(SQLConf.MAX_NESTED_VIEW_DEPTH, activeConf.maxNestedViewDepth) sqlConf } + + def fromCatalogTable( + metadata: CatalogTable, isTempView: Boolean, parser: ParserInterface): View = { + val viewText = 
metadata.viewText.getOrElse(sys.error("Invalid view without text.")) + val viewConfigs = metadata.viewSQLConfigs + val viewPlan = + SQLConf.withExistingConf(effectiveSQLConf(viewConfigs, isTempView = isTempView)) { + parser.parsePlan(viewText) + } + View( + desc = metadata, + isTempView = isTempView, + output = metadata.schema.toAttributes, + child = viewPlan) + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 07cd41b06de21..496065f85fbbf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1489,6 +1489,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val STORE_ANALYZED_PLAN_FOR_VIEW = + buildConf("spark.sql.legacy.storeAnalyzedPlanForView") + .internal() + .doc("When true, analyzed plan instead of SQL text will be stored when creating " + + "temporary view") + .version("3.1.0") + .booleanConf + .createWithDefault(false) + val STREAMING_FILE_COMMIT_PROTOCOL_CLASS = buildConf("spark.sql.streaming.commitProtocolClass") .version("2.1.0") @@ -3435,6 +3444,8 @@ class SQLConf extends Serializable with Logging { def useCurrentSQLConfigsForView: Boolean = getConf(SQLConf.USE_CURRENT_SQL_CONFIGS_FOR_VIEW) + def storeAnalyzedPlanForView: Boolean = getConf(SQLConf.STORE_ANALYZED_PLAN_FOR_VIEW) + def starSchemaDetection: Boolean = getConf(STARSCHEMA_DETECTION) def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 0afa811e5d590..f5bfdc5e695e0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -665,6 +665,7 @@ class AnalysisSuite extends AnalysisTest with Matchers { tableType = CatalogTableType.VIEW, storage = CatalogStorageFormat.empty, schema = StructType(Seq(StructField("a", IntegerType), StructField("b", StringType)))), + isTempView = false, output = Seq(Symbol("a").int, Symbol("b").string), child = relation) val tz = Option(conf.sessionLocalTimeZone) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index f30ae70dceffa..98f9ce6fe9dbb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -646,7 +646,7 @@ abstract class SessionCatalogSuite extends AnalysisTest with Eventually { // Look up a view. 
catalog.setCurrentDatabase("default") - val view = View(desc = metadata, output = metadata.schema.toAttributes, + val view = View(desc = metadata, isTempView = false, output = metadata.schema.toAttributes, child = CatalystSqlParser.parsePlan(metadata.viewText.get)) comparePlans(catalog.lookupRelation(TableIdentifier("view1", Some("db3"))), SubqueryAlias(Seq(CatalogManager.SESSION_CATALOG_NAME, "db3", "view1"), view)) @@ -666,7 +666,7 @@ abstract class SessionCatalogSuite extends AnalysisTest with Eventually { assert(metadata.viewText.isDefined) assert(metadata.viewCatalogAndNamespace == Seq(CatalogManager.SESSION_CATALOG_NAME, "db2")) - val view = View(desc = metadata, output = metadata.schema.toAttributes, + val view = View(desc = metadata, isTempView = false, output = metadata.schema.toAttributes, child = CatalystSqlParser.parsePlan(metadata.viewText.get)) comparePlans(catalog.lookupRelation(TableIdentifier("view2", Some("db3"))), SubqueryAlias(Seq(CatalogManager.SESSION_CATALOG_NAME, "db3", "view2"), view)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 3b90f807b3138..641bd26c381ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -61,9 +61,10 @@ case class AnalyzeColumnCommand( private def analyzeColumnInCachedData(plan: LogicalPlan, sparkSession: SparkSession): Boolean = { val cacheManager = sparkSession.sharedState.cacheManager - cacheManager.lookupCachedData(plan).map { cachedData => + val planToLookup = sparkSession.sessionState.executePlan(plan).analyzed + cacheManager.lookupCachedData(planToLookup).map { cachedData => val columnsToAnalyze = getColumnsToAnalyze( - tableIdent, cachedData.plan, columnNames, allColumns) + tableIdent, cachedData.cachedRepresentation, columnNames, allColumns) cacheManager.analyzeColumnCacheQuery(sparkSession, cachedData, columnsToAnalyze) cachedData }.isDefined diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index a02f863a360f8..4ad5eddb83f43 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -19,16 +19,19 @@ package org.apache.spark.sql.execution.command import scala.collection.mutable +import org.json4s.JsonAST.{JArray, JString} +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, UnresolvedFunction, UnresolvedRelation, ViewType} -import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, SessionCatalog} +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, SessionCatalog, TemporaryViewRelation} import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View} import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper import 
org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} -import org.apache.spark.sql.types.{BooleanType, MetadataBuilder, StringType} +import org.apache.spark.sql.types.{BooleanType, MetadataBuilder, StringType, StructType} import org.apache.spark.sql.util.SchemaUtils /** @@ -107,26 +110,61 @@ case class CreateViewCommand( // When creating a permanent view, not allowed to reference temporary objects. // This should be called after `qe.assertAnalyzed()` (i.e., `child` can be resolved) - verifyTemporaryObjectsNotExists(catalog) + verifyTemporaryObjectsNotExists(catalog, isTemporary, name, child) if (viewType == LocalTempView) { - if (replace && catalog.getTempView(name.table).isDefined && - !catalog.getTempView(name.table).get.sameResult(child)) { + val samePlan = catalog.getTempView(name.table).exists { + // Don't perform sameResult check for View logical plan, since it's unresolved + case _: View => false + case other => other.sameResult(child) + } + if (replace && !samePlan) { logInfo(s"Try to uncache ${name.quotedString} before replacing.") + checkCyclicViewReference(analyzedPlan, Seq(name), name) CommandUtils.uncacheTableOrView(sparkSession, name.quotedString) } val aliasedPlan = aliasPlan(sparkSession, analyzedPlan) - catalog.createTempView(name.table, aliasedPlan, overrideIfExists = replace) + // If there is no sql text (e.g. from Dataset API), we will always store the analyzed plan + val tableDefinition = if (!conf.storeAnalyzedPlanForView && originalText.nonEmpty) { + TemporaryViewRelation( + prepareTemporaryView( + name, + sparkSession, + analyzedPlan, + aliasedPlan.schema, + originalText, + child)) + } else { + aliasedPlan + } + catalog.createTempView(name.table, tableDefinition, overrideIfExists = replace) } else if (viewType == GlobalTempView) { - if (replace && catalog.getGlobalTempView(name.table).isDefined && - !catalog.getGlobalTempView(name.table).get.sameResult(child)) { - val db = sparkSession.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) - val globalTempView = TableIdentifier(name.table, Option(db)) - logInfo(s"Try to uncache ${globalTempView.quotedString} before replacing.") - CommandUtils.uncacheTableOrView(sparkSession, globalTempView.quotedString) + val db = sparkSession.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + val viewIdent = TableIdentifier(name.table, Option(db)) + val samePlan = catalog.getGlobalTempView(name.table).exists { + // Don't perform sameResult check for View logical plan, since it's unresolved + case _: View => false + case other => other.sameResult(child) + } + if (replace && !samePlan) { + logInfo(s"Try to uncache ${viewIdent.quotedString} before replacing.") + checkCyclicViewReference(analyzedPlan, Seq(viewIdent), viewIdent) + CommandUtils.uncacheTableOrView(sparkSession, viewIdent.quotedString) } val aliasedPlan = aliasPlan(sparkSession, analyzedPlan) - catalog.createGlobalTempView(name.table, aliasedPlan, overrideIfExists = replace) + val tableDefinition = if (!conf.storeAnalyzedPlanForView && originalText.nonEmpty) { + TemporaryViewRelation( + prepareTemporaryView( + viewIdent, + sparkSession, + analyzedPlan, + aliasedPlan.schema, + originalText, + child)) + } else { + aliasedPlan + } + catalog.createGlobalTempView(name.table, tableDefinition, overrideIfExists = replace) } else if (catalog.tableExists(name)) { val tableMetadata = catalog.getTableMetadata(name) if (allowExisting) { @@ -161,39 +199,6 @@ case class CreateViewCommand( Seq.empty[Row] } - /** - * Permanent views are not allowed to reference temp 
objects, including temp function and views - */ - private def verifyTemporaryObjectsNotExists(catalog: SessionCatalog): Unit = { - import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ - if (!isTemporary) { - // This func traverses the unresolved plan `child`. Below are the reasons: - // 1) Analyzer replaces unresolved temporary views by a SubqueryAlias with the corresponding - // logical plan. After replacement, it is impossible to detect whether the SubqueryAlias is - // added/generated from a temporary view. - // 2) The temp functions are represented by multiple classes. Most are inaccessible from this - // package (e.g., HiveGenericUDF). - def verify(child: LogicalPlan): Unit = { - child.collect { - // Disallow creating permanent views based on temporary views. - case UnresolvedRelation(nameParts, _, _) if catalog.isTempView(nameParts) => - throw new AnalysisException(s"Not allowed to create a permanent view $name by " + - s"referencing a temporary view ${nameParts.quoted}. " + - "Please create a temp view instead by CREATE TEMP VIEW") - case other if !other.resolved => other.expressions.flatMap(_.collect { - // Traverse subquery plan for any unresolved relations. - case e: SubqueryExpression => verify(e.plan) - // Disallow creating permanent views based on temporary UDFs. - case e: UnresolvedFunction if catalog.isTemporaryFunction(e.name) => - throw new AnalysisException(s"Not allowed to create a permanent view $name by " + - s"referencing a temporary function `${e.name}`") - }) - } - } - verify(child) - } - } - /** * If `userSpecifiedColumns` is defined, alias the analyzed plan to the user specified columns, * else return the analyzed plan directly. @@ -266,15 +271,26 @@ case class AlterViewAsCommand( qe.assertAnalyzed() val analyzedPlan = qe.analyzed - if (session.sessionState.catalog.alterTempViewDefinition(name, analyzedPlan)) { - // a local/global temp view has been altered, we are done. + if (session.sessionState.catalog.isTemporaryTable(name)) { + alterTemporaryView(session, analyzedPlan) } else { alterPermanentView(session, analyzedPlan) } - Seq.empty[Row] } + private def alterTemporaryView(session: SparkSession, analyzedPlan: LogicalPlan): Unit = { + val tableDefinition = if (conf.storeAnalyzedPlanForView) { + analyzedPlan + } else { + checkCyclicViewReference(analyzedPlan, Seq(name), name) + TemporaryViewRelation( + prepareTemporaryView( + name, session, analyzedPlan, analyzedPlan.schema, Some(originalText), query)) + } + session.sessionState.catalog.alterTempViewDefinition(name, tableDefinition) + } + private def alterPermanentView(session: SparkSession, analyzedPlan: LogicalPlan): Unit = { val viewMeta = session.sessionState.catalog.getTableMetadata(name) if (viewMeta.tableType != CatalogTableType.VIEW) { @@ -398,6 +414,34 @@ object ViewHelper { } } + /** + * Convert the temporary object names to `properties`. + */ + private def referredTempNamesToProps( + viewNames: Seq[Seq[String]], functionsNames: Seq[String]): Map[String, String] = { + val viewNamesJson = + JArray(viewNames.map(nameParts => JArray(nameParts.map(JString).toList)).toList) + val functionsNamesJson = JArray(functionsNames.map(JString).toList) + + val props = new mutable.HashMap[String, String] + props.put(VIEW_REFERRED_TEMP_VIEW_NAMES, compact(render(viewNamesJson))) + props.put(VIEW_REFERRED_TEMP_FUNCTION_NAMES, compact(render(functionsNamesJson))) + props.toMap + } + + /** + * Remove the temporary object names in `properties`. 
+ */ + private def removeReferredTempNames(properties: Map[String, String]): Map[String, String] = { + // We can't use `filterKeys` here, as the map returned by `filterKeys` is not serializable, + // while `CatalogTable` should be serializable. + properties.filterNot { case (key, _) => + key.startsWith(VIEW_REFERRED_TEMP_VIEW_NAMES) || + key.startsWith(VIEW_REFERRED_TEMP_FUNCTION_NAMES) + } + } + + /** * Generate the view properties in CatalogTable, including: * 1. view default database that is used to provide the default database name on view resolution. @@ -414,7 +458,9 @@ object ViewHelper { properties: Map[String, String], session: SparkSession, analyzedPlan: LogicalPlan, - fieldNames: Array[String]): Map[String, String] = { + fieldNames: Array[String], + tempViewNames: Seq[Seq[String]] = Seq.empty, + tempFunctionNames: Seq[String] = Seq.empty): Map[String, String] = { // for createViewCommand queryOutput may be different from fieldNames val queryOutput = analyzedPlan.schema.fieldNames @@ -427,10 +473,11 @@ object ViewHelper { // Generate the view default catalog and namespace, as well as captured SQL configs. val manager = session.sessionState.catalogManager - removeSQLConfigs(removeQueryColumnNames(properties)) ++ + removeReferredTempNames(removeSQLConfigs(removeQueryColumnNames(properties))) ++ catalogAndNamespaceToProps(manager.currentCatalog.name, manager.currentNamespace) ++ sqlConfigsToProps(conf) ++ - generateQueryColumnNames(queryOutput) + generateQueryColumnNames(queryOutput) ++ + referredTempNamesToProps(tempViewNames, tempFunctionNames) } /** @@ -481,4 +528,92 @@ object ViewHelper { } } } + + + /** + * Permanent views are not allowed to reference temp objects, including temp function and views + */ + def verifyTemporaryObjectsNotExists( + catalog: SessionCatalog, + isTemporary: Boolean, + name: TableIdentifier, + child: LogicalPlan): Unit = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + if (!isTemporary) { + val (tempViews, tempFunctions) = collectTemporaryObjects(catalog, child) + tempViews.foreach { nameParts => + throw new AnalysisException(s"Not allowed to create a permanent view $name by " + + s"referencing a temporary view ${nameParts.quoted}. " + + "Please create a temp view instead by CREATE TEMP VIEW") + } + tempFunctions.foreach { funcName => + throw new AnalysisException(s"Not allowed to create a permanent view $name by " + + s"referencing a temporary function `${funcName}`") + } + } + } + + /** + * Collect all temporary views and functions and return the identifiers separately + * This func traverses the unresolved plan `child`. Below are the reasons: + * 1) Analyzer replaces unresolved temporary views by a SubqueryAlias with the corresponding + * logical plan. After replacement, it is impossible to detect whether the SubqueryAlias is + * added/generated from a temporary view. + * 2) The temp functions are represented by multiple classes. Most are inaccessible from this + * package (e.g., HiveGenericUDF). 
+ */ + private def collectTemporaryObjects( + catalog: SessionCatalog, child: LogicalPlan): (Seq[Seq[String]], Seq[String]) = { + def collectTempViews(child: LogicalPlan): Seq[Seq[String]] = { + child.collect { + case UnresolvedRelation(nameParts, _, _) if catalog.isTempView(nameParts) => + Seq(nameParts) + case plan if !plan.resolved => plan.expressions.flatMap(_.collect { + case e: SubqueryExpression => collectTempViews(e.plan) + }).flatten + }.flatten.distinct + } + + def collectTempFunctions(child: LogicalPlan): Seq[String] = { + child.collect { + case plan if !plan.resolved => plan.expressions.flatMap(_.collect { + case e: SubqueryExpression => collectTempFunctions(e.plan) + case e: UnresolvedFunction if catalog.isTemporaryFunction(e.name) => + Seq(e.name.funcName) + }).flatten + }.flatten.distinct + } + (collectTempViews(child), collectTempFunctions(child)) + } + + + /** + * Returns a [[CatalogTable]] that contains information for temporary view. + * Generate the view-specific properties(e.g. view default database, view query output + * column names) and store them as properties in the CatalogTable, and also creates + * the proper schema for the view. + */ + def prepareTemporaryView( + viewName: TableIdentifier, + session: SparkSession, + analyzedPlan: LogicalPlan, + viewSchema: StructType, + originalText: Option[String], + child: LogicalPlan): CatalogTable = { + + val catalog = session.sessionState.catalog + val (tempViews, tempFunctions) = collectTemporaryObjects(catalog, child) + // TBLPROPERTIES is not allowed for temporary view, so we don't use it for + // generating temporary view properties + val newProperties = generateViewProperties( + Map.empty, session, analyzedPlan, viewSchema.fieldNames, tempViews, tempFunctions) + + CatalogTable( + identifier = viewName, + tableType = CatalogTableType.VIEW, + storage = CatalogStorageFormat.empty, + schema = viewSchema, + viewText = originalText, + properties = newProperties) + } } diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index 145c987ee5f61..2674d055ac450 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -477,7 +477,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Catalog and Namespace spark_catalog.default View Query Output Columns [a, b, c, d] -Table Properties [view.query.out.col.3=d, view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=4, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=c, view.catalogAndNamespace.part.1=default] +Table Properties [view.query.out.col.3=d, view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=4, view.referredTempViewNames=[], view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=c, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=default] -- !query @@ -501,7 +501,7 @@ View Text SELECT * FROM t View Original Text SELECT * FROM t View Catalog and Namespace spark_catalog.default View Query Output Columns [a, b, c, d] -Table Properties [view.query.out.col.3=d, view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=4, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=c, view.catalogAndNamespace.part.1=default] +Table Properties [view.query.out.col.3=d, 
view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=4, view.referredTempViewNames=[], view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=c, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=default] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out index 89a4da116a6b3..149e031e8829c 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out @@ -795,13 +795,15 @@ IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few comman : +- Project [state#x] : +- Filter (dept_id#x = outer(dept_id#x)) : +- SubqueryAlias dept -: +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias DEPT -: +- LocalRelation [dept_id#x, dept_name#x, state#x] +: +- View (`DEPT`, [dept_id#x,dept_name#x,state#x]) +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] +- SubqueryAlias emp - +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias EMP - +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- View (`EMP`, [id#x,emp_name#x,hiredate#x,salary#x,dept_id#x]) + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; @@ -821,13 +823,15 @@ IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few comman : +- Project [state#x] : +- Filter (dept_id#x = outer(dept_id#x)) : +- SubqueryAlias dept -: +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias DEPT -: +- LocalRelation [dept_id#x, dept_name#x, state#x] +: +- View (`DEPT`, [dept_id#x,dept_name#x,state#x]) +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] +- SubqueryAlias emp - +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias EMP - +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- View (`EMP`, [id#x,emp_name#x,hiredate#x,salary#x,dept_id#x]) + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; @@ -846,13 +850,15 @@ IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few comman : +- Distinct : +- Project [dept_id#x] : +- SubqueryAlias dept -: +- Project [dept_id#x, dept_name#x, state#x] -: +- SubqueryAlias DEPT -: +- LocalRelation [dept_id#x, dept_name#x, state#x] +: +- View (`DEPT`, [dept_id#x,dept_name#x,state#x]) +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] +- SubqueryAlias emp - +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias EMP - +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- View (`EMP`, [id#x,emp_name#x,hiredate#x,salary#x,dept_id#x]) + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; @@ -871,13 +877,15 @@ IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few comman : +- Distinct : +- Project [dept_id#x] : +- SubqueryAlias dept -: +- Project [dept_id#x, dept_name#x, 
state#x] -: +- SubqueryAlias DEPT -: +- LocalRelation [dept_id#x, dept_name#x, state#x] +: +- View (`DEPT`, [dept_id#x,dept_name#x,state#x]) +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] +- SubqueryAlias emp - +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] - +- SubqueryAlias EMP - +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- View (`EMP`, [id#x,emp_name#x,hiredate#x,salary#x,dept_id#x]) + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] ; diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out index 2fab32fa4b4eb..7d331f24b9215 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -257,7 +257,7 @@ View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -313,7 +313,7 @@ View Text SELECT * FROM base_table View Original Text SELECT * FROM base_table View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -359,7 +359,7 @@ View Original Text SELECT t1.a AS t1_a, t2.a AS t2_a WHERE t1.id = t2.id View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [t1_a, t2_a] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=t1_a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=t2_a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=t1_a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=t2_a, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -413,7 +413,7 @@ View Text SELECT * FROM base_table WHERE id IN (SELECT 
id FROM base_t View Original Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -443,7 +443,7 @@ View Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_ View Original Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [id, a] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=id, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=id, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=a, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -473,7 +473,7 @@ View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_t View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -503,7 +503,7 @@ View Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM ba View Original Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -533,7 +533,7 @@ View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Original 
Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) View Catalog and Namespace spark_catalog.temp_view_test View Query Output Columns [a, id] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=temp_view_test] -- !query @@ -669,7 +669,7 @@ View Text SELECT * FROM t1 CROSS JOIN t2 View Original Text SELECT * FROM t1 CROSS JOIN t2 View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -710,7 +710,7 @@ View Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -751,7 +751,7 @@ View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, 
view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -792,7 +792,7 @@ View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.va View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [num, name, num2, value] -Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -894,7 +894,7 @@ BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [a, b] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=testviewschm2] -- !query @@ -933,7 +933,7 @@ AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) View Catalog and Namespace spark_catalog.testviewschm2 View Query Output Columns [a, b] -Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.referredTempViewNames=[], view.sqlConfig.spark.sql.ansi.enabled=true, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.referredTempFunctionsNames=[], view.catalogAndNamespace.part.1=testviewschm2] -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out index eaaf894590d35..3fb948056dc01 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tblproperties.sql.out @@ -64,6 +64,8 @@ view.catalogAndNamespace.part.0 spark_catalog view.catalogAndNamespace.part.1 default view.query.out.col.0 c1 view.query.out.numCols 1 +view.referredTempFunctionsNames [] +view.referredTempViewNames [] -- !query diff --git 
a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out index d703d4e9112e9..cd96eaf1b878b 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out @@ -111,7 +111,8 @@ org.apache.spark.sql.AnalysisException Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses: Aggregate [min(outer(t2a#x)) AS min(outer(t2.`t2a`))#x] +- SubqueryAlias t3 - +- Project [t3a#x, t3b#x, t3c#x] - +- SubqueryAlias t3 - +- LocalRelation [t3a#x, t3b#x, t3c#x] + +- View (`t3`, [t3a#x,t3b#x,t3c#x]) + +- Project [t3a#x, t3b#x, t3c#x] + +- SubqueryAlias t3 + +- LocalRelation [t3a#x, t3b#x, t3c#x] ; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index ef3f4daa6dc6b..d0150616cd67e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -1239,26 +1239,4 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } } } - - test("SPARK-33290: querying temporary view after REFRESH TABLE fails with FNFE") { - withTable("t") { - withTempPath { path => - withTempView("tempView1") { - Seq((1 -> "a")).toDF("i", "j").write.parquet(path.getCanonicalPath) - sql(s"CREATE TABLE t USING parquet LOCATION '${path.toURI}'") - sql("CREATE TEMPORARY VIEW tempView1 AS SELECT * FROM t") - checkAnswer(sql("SELECT * FROM tempView1"), Seq(Row(1, "a"))) - - Utils.deleteRecursively(path) - sql("REFRESH TABLE t") - checkAnswer(sql("SELECT * FROM t"), Seq.empty) - val exception = intercept[Exception] { - checkAnswer(sql("SELECT * FROM tempView1"), Seq.empty) - } - assert(exception.getMessage.contains("FileNotFoundException")) - assert(exception.getMessage.contains("REFRESH TABLE")) - } - } - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 0b19f706836be..709d6321d199d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution +import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException @@ -763,6 +764,89 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } } + test("temporary view should ignore useCurrentSQLConfigsForView config") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + withTempView("v1") { + sql("CREATE TEMPORARY VIEW v1 AS SELECT 1/0") + withSQLConf( + USE_CURRENT_SQL_CONFIGS_FOR_VIEW.key -> "true", + ANSI_ENABLED.key -> "true") { + checkAnswer(sql("SELECT * FROM v1"), Seq(Row(null))) + } + } + } + } + + test("alter temporary view should follow current storeAnalyzedPlanForView config") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + withView("v1") { + withSQLConf(STORE_ANALYZED_PLAN_FOR_VIEW.key -> "true") { + sql("CREATE TEMPORARY VIEW v1 AS SELECT * FROM t") + Seq(4, 6, 
5).toDF("c1").write.mode("overwrite").format("parquet").saveAsTable("t") + val e = intercept[SparkException] { + sql("SELECT * FROM v1").collect() + }.getMessage + assert(e.contains("does not exist")) + } + + withSQLConf(STORE_ANALYZED_PLAN_FOR_VIEW.key -> "false") { + // alter view from legacy to non-legacy config + sql("ALTER VIEW v1 AS SELECT * FROM t") + Seq(1, 3, 5).toDF("c1").write.mode("overwrite").format("parquet").saveAsTable("t") + checkAnswer(sql("SELECT * FROM v1"), Seq(Row(1), Row(3), Row(5))) + } + + withSQLConf(STORE_ANALYZED_PLAN_FOR_VIEW.key -> "true") { + // alter view from non-legacy to legacy config + sql("ALTER VIEW v1 AS SELECT * FROM t") + Seq(2, 4, 6).toDF("c1").write.mode("overwrite").format("parquet").saveAsTable("t") + val e = intercept[SparkException] { + sql("SELECT * FROM v1").collect() + }.getMessage + assert(e.contains("does not exist")) + } + } + } + } + + test("local temp view refers global temp view") { + withGlobalTempView("v1") { + withTempView("v2") { + val globalTempDB = spark.sharedState.globalTempViewManager.database + sql("CREATE GLOBAL TEMPORARY VIEW v1 AS SELECT 1") + sql(s"CREATE TEMPORARY VIEW v2 AS SELECT * FROM ${globalTempDB}.v1") + checkAnswer(sql("SELECT * FROM v2"), Seq(Row(1))) + } + } + } + + test("global temp view refers local temp view") { + withTempView("v1") { + withGlobalTempView("v2") { + val globalTempDB = spark.sharedState.globalTempViewManager.database + sql("CREATE TEMPORARY VIEW v1 AS SELECT 1") + sql(s"CREATE GLOBAL TEMPORARY VIEW v2 AS SELECT * FROM v1") + checkAnswer(sql(s"SELECT * FROM ${globalTempDB}.v2"), Seq(Row(1))) + } + } + } + + test("creating local temp view should not affect existing table reference") { + withTable("t") { + withTempView("t") { + withGlobalTempView("v") { + val globalTempDB = spark.sharedState.globalTempViewManager.database + Seq(2).toDF("c1").write.format("parquet").saveAsTable("t") + sql("CREATE GLOBAL TEMPORARY VIEW v AS SELECT * FROM t") + sql("CREATE TEMPORARY VIEW t AS SELECT 1") + checkAnswer(sql(s"SELECT * FROM ${globalTempDB}.v"), Seq(Row(2))) + } + } + } + } + test("SPARK-33141: view should be parsed and analyzed with configs set when creating") { withTable("t") { withView("v1", "v2", "v3", "v4", "v5") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala new file mode 100644 index 0000000000000..fb9f5a73f6d9e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf._ +import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} + +/** + * A base suite contains a set of view related test cases for different kind of views + * Currently, the test cases in this suite should have same behavior across all kind of views + * TODO: Combine this with [[SQLViewSuite]] + */ +abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { + import testImplicits._ + + protected def viewTypeString: String + protected def formattedViewName(viewName: String): String + + def createView( + viewName: String, + sqlText: String, + columnNames: Seq[String] = Seq.empty, + replace: Boolean = false): String = { + val replaceString = if (replace) "OR REPLACE" else "" + val columnString = if (columnNames.nonEmpty) columnNames.mkString("(", ",", ")") else "" + sql(s"CREATE $replaceString $viewTypeString $viewName $columnString AS $sqlText") + formattedViewName(viewName) + } + + def checkViewOutput(viewName: String, expectedAnswer: Seq[Row]): Unit = { + checkAnswer(sql(s"SELECT * FROM $viewName"), expectedAnswer) + } + + test("change SQLConf should not change view behavior - caseSensitiveAnalysis") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("v1", "SELECT c1 FROM t", Seq("C1")) + withView(viewName) { + Seq("true", "false").foreach { flag => + withSQLConf(CASE_SENSITIVE.key -> flag) { + checkViewOutput(viewName, Seq(Row(2), Row(3), Row(1))) + } + } + } + } + } + + test("change SQLConf should not change view behavior - orderByOrdinal") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("v1", "SELECT c1 FROM t ORDER BY 1 ASC, c1 DESC", Seq("c1")) + withView(viewName) { + Seq("true", "false").foreach { flag => + withSQLConf(ORDER_BY_ORDINAL.key -> flag) { + checkViewOutput(viewName, Seq(Row(1), Row(2), Row(3))) + } + } + } + } + } + + test("change SQLConf should not change view behavior - groupByOrdinal") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("v1", "SELECT c1, count(c1) FROM t GROUP BY 1", Seq("c1", "count")) + withView(viewName) { + Seq("true", "false").foreach { flag => + withSQLConf(GROUP_BY_ORDINAL.key -> flag) { + checkViewOutput(viewName, Seq(Row(1, 1), Row(2, 1), Row(3, 1))) + } + } + } + } + } + + test("change SQLConf should not change view behavior - groupByAliases") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView( + "v1", "SELECT c1 as a, count(c1) FROM t GROUP BY a", Seq("a", "count")) + withView(viewName) { + Seq("true", "false").foreach { flag => + withSQLConf(GROUP_BY_ALIASES.key -> flag) { + checkViewOutput(viewName, Seq(Row(1, 1), Row(2, 1), Row(3, 1))) + } + } + } + } + } + + test("change SQLConf should not change view behavior - ansiEnabled") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("v1", "SELECT 1/0", Seq("c1")) + withView(viewName) { + Seq("true", "false").foreach { flag => + withSQLConf(ANSI_ENABLED.key -> flag) { + checkViewOutput(viewName, Seq(Row(null))) + } + } + } + } + } + + test("change current database should not change view behavior") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val 
viewName = createView("v1", "SELECT * from t") + withView(viewName) { + withTempDatabase { db => + sql(s"USE $db") + Seq(4, 5, 6).toDF("c1").write.format("parquet").saveAsTable("t") + checkViewOutput(viewName, Seq(Row(2), Row(3), Row(1))) + } + } + } + } + + test("view should read the new data if table is updated") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("v1", "SELECT c1 from t", Seq("c1")) + withView(viewName) { + Seq(9, 7, 8).toDF("c1").write.mode("overwrite").format("parquet").saveAsTable("t") + checkViewOutput(viewName, Seq(Row(9), Row(7), Row(8))) + } + } + } + + test("add column for table should not affect view output") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("v1", "SELECT * from t") + withView(viewName) { + sql("ALTER TABLE t ADD COLUMN (c2 INT)") + checkViewOutput(viewName, Seq(Row(2), Row(3), Row(1))) + } + } + } + + test("check cyclic view reference on CREATE OR REPLACE VIEW") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName1 = createView("v1", "SELECT * from t") + val viewName2 = createView("v2", s"SELECT * from $viewName1") + withView(viewName2, viewName1) { + val e = intercept[AnalysisException] { + createView("v1", s"SELECT * FROM $viewName2", replace = true) + }.getMessage + assert(e.contains("Recursive view")) + } + } + } + + test("check cyclic view reference on ALTER VIEW") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName1 = createView("v1", "SELECT * from t") + val viewName2 = createView("v2", s"SELECT * from $viewName1") + withView(viewName2, viewName1) { + val e = intercept[AnalysisException] { + sql(s"ALTER VIEW $viewName1 AS SELECT * FROM $viewName2") + }.getMessage + assert(e.contains("Recursive view")) + } + } + } +} + +class LocalTempViewTestSuite extends SQLViewTestSuite with SharedSparkSession { + override protected def viewTypeString: String = "TEMPORARY VIEW" + override protected def formattedViewName(viewName: String): String = viewName + +} + +class GlobalTempViewTestSuite extends SQLViewTestSuite with SharedSparkSession { + override protected def viewTypeString: String = "GLOBAL TEMPORARY VIEW" + override protected def formattedViewName(viewName: String): String = { + val globalTempDB = spark.sharedState.globalTempViewManager.database + s"$globalTempDB.$viewName" + } +} + +class PersistedViewTestSuite extends SQLViewTestSuite with SharedSparkSession { + override protected def viewTypeString: String = "VIEW" + override protected def formattedViewName(viewName: String): String = s"default.$viewName" +} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala index 88aebb36633f6..66e6cf82922b7 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala @@ -105,7 +105,7 @@ private[hive] class SparkGetColumnsOperation( val databasePattern = Pattern.compile(CLIServiceUtils.patternToRegex(schemaName)) if (databasePattern.matcher(globalTempViewDb).matches()) { catalog.globalTempViewManager.listViewNames(tablePattern).foreach { globalTempView => - 
catalog.globalTempViewManager.get(globalTempView).foreach { plan => + catalog.getGlobalTempView(globalTempView).foreach { plan => addToRowSet(columnPattern, globalTempViewDb, globalTempView, plan.schema) } } From 15579ba1f82e321a694130d4c9db2a6524e9ae2e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 4 Dec 2020 07:23:35 +0000 Subject: [PATCH 100/150] [SPARK-33430][SQL] Support namespaces in JDBC v2 Table Catalog ### What changes were proposed in this pull request? Add namespaces support in JDBC v2 Table Catalog by making ```JDBCTableCatalog``` extends```SupportsNamespaces``` ### Why are the changes needed? make v2 JDBC implementation complete ### Does this PR introduce _any_ user-facing change? Yes. Add the following to ```JDBCTableCatalog``` - listNamespaces - listNamespaces(String[] namespace) - namespaceExists(String[] namespace) - loadNamespaceMetadata(String[] namespace) - createNamespace - alterNamespace - dropNamespace ### How was this patch tested? Add new docker tests Closes #30473 from huaxingao/name_space. Authored-by: Huaxin Gao Signed-off-by: Wenchen Fan --- .../sql/jdbc/v2/PostgresNamespaceSuite.scala | 59 +++++++ .../sql/jdbc/v2/V2JDBCNamespaceTest.scala | 62 ++++++++ .../datasources/jdbc/JdbcUtils.scala | 49 ++++++ .../v2/jdbc/JDBCTableCatalog.scala | 144 +++++++++++++++++- .../apache/spark/sql/jdbc/JdbcDialects.scala | 12 +- 5 files changed, 317 insertions(+), 9 deletions(-) create mode 100644 external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala create mode 100644 external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala new file mode 100644 index 0000000000000..e534df84ce6fa --- /dev/null +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc.v2 + +import java.sql.Connection + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.jdbc.{DatabaseOnDocker, DockerJDBCIntegrationSuite} +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., postgres:13.0): + * {{{ + * POSTGRES_DOCKER_IMAGE_NAME=postgres:13.0 + * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresNamespaceSuite" + * }}} + */ +@DockerTest +class PostgresNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:13.0-alpine") + override val env = Map( + "POSTGRES_PASSWORD" -> "rootpass" + ) + override val usesIpc = false + override val jdbcPort = 5432 + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" + } + + val map = new CaseInsensitiveStringMap( + Map("url" -> db.getJdbcUrl(dockerIp, externalPort), + "driver" -> "org.postgresql.Driver").asJava) + + catalog.initialize("postgresql", map) + + override def dataPreparation(conn: Connection): Unit = {} + + override def builtinNamespaces: Array[Array[String]] = { + Array(Array("information_schema"), Array("pg_catalog"), Array("public")) + } +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala new file mode 100644 index 0000000000000..979b0784f0448 --- /dev/null +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc.v2 + +import scala.collection.JavaConverters._ + +import org.apache.log4j.Level + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.connector.catalog.NamespaceChange +import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.tags.DockerTest + +@DockerTest +private[v2] trait V2JDBCNamespaceTest extends SharedSparkSession { + val catalog = new JDBCTableCatalog() + + def builtinNamespaces: Array[Array[String]] + + test("listNamespaces: basic behavior") { + catalog.createNamespace(Array("foo"), Map("comment" -> "test comment").asJava) + assert(catalog.listNamespaces() === Array(Array("foo")) ++ builtinNamespaces) + assert(catalog.listNamespaces(Array("foo")) === Array()) + assert(catalog.namespaceExists(Array("foo")) === true) + + val logAppender = new LogAppender("catalog comment") + withLogAppender(logAppender) { + catalog.alterNamespace(Array("foo"), NamespaceChange + .setProperty("comment", "comment for foo")) + catalog.alterNamespace(Array("foo"), NamespaceChange.removeProperty("comment")) + } + val createCommentWarning = logAppender.loggingEvents + .filter(_.getLevel == Level.WARN) + .map(_.getRenderedMessage) + .exists(_.contains("catalog comment")) + assert(createCommentWarning === false) + + catalog.dropNamespace(Array("foo")) + assert(catalog.namespaceExists(Array("foo")) === false) + assert(catalog.listNamespaces() === builtinNamespaces) + val msg = intercept[AnalysisException] { + catalog.listNamespaces(Array("foo")) + }.getMessage + assert(msg.contains("Namespace 'foo' not found")) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 5dd0d2bd74838..216fb02740500 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -927,6 +927,55 @@ object JdbcUtils extends Logging { } } + /** + * Creates a namespace. + */ + def createNamespace( + conn: Connection, + options: JDBCOptions, + namespace: String, + comment: String): Unit = { + val dialect = JdbcDialects.get(options.url) + executeStatement(conn, options, s"CREATE SCHEMA ${dialect.quoteIdentifier(namespace)}") + if (!comment.isEmpty) createNamespaceComment(conn, options, namespace, comment) + } + + def createNamespaceComment( + conn: Connection, + options: JDBCOptions, + namespace: String, + comment: String): Unit = { + val dialect = JdbcDialects.get(options.url) + try { + executeStatement( + conn, options, dialect.getSchemaCommentQuery(namespace, comment)) + } catch { + case e: Exception => + logWarning("Cannot create JDBC catalog comment. The catalog comment will be ignored.") + } + } + + def removeNamespaceComment( + conn: Connection, + options: JDBCOptions, + namespace: String): Unit = { + val dialect = JdbcDialects.get(options.url) + try { + executeStatement(conn, options, dialect.removeSchemaCommentQuery(namespace)) + } catch { + case e: Exception => + logWarning("Cannot drop JDBC catalog comment.") + } + } + + /** + * Drops a namespace from the JDBC database. 
+ */ + def dropNamespace(conn: Connection, options: JDBCOptions, namespace: String): Unit = { + val dialect = JdbcDialects.get(options.url) + executeStatement(conn, options, s"DROP SCHEMA ${dialect.quoteIdentifier(namespace)}") + } + private def executeStatement(conn: Connection, options: JDBCOptions, sql: String): Unit = { val statement = conn.createStatement try { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala index 63f802363f7c0..27558e5b0d61b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalog.scala @@ -17,13 +17,16 @@ package org.apache.spark.sql.execution.datasources.v2.jdbc import java.sql.{Connection, SQLException} +import java.util import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.collection.mutable.ArrayBuilder import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, NoSuchTableException} -import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog, TableChange} +import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, NoSuchTableException} +import org.apache.spark.sql.connector.catalog.{Identifier, NamespaceChange, SupportsNamespaces, Table, TableCatalog, TableChange} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcOptionsInWrite, JDBCRDD, JdbcUtils} import org.apache.spark.sql.internal.SQLConf @@ -31,7 +34,8 @@ import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap -class JDBCTableCatalog extends TableCatalog with Logging { +class JDBCTableCatalog extends TableCatalog with SupportsNamespaces with Logging { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper private var catalogName: String = null private var options: JDBCOptions = _ @@ -125,12 +129,12 @@ class JDBCTableCatalog extends TableCatalog with Logging { if (!properties.isEmpty) { properties.asScala.map { case (k, v) => k match { - case "comment" => tableComment = v - case "provider" => + case TableCatalog.PROP_COMMENT => tableComment = v + case TableCatalog.PROP_PROVIDER => throw new AnalysisException("CREATE TABLE ... USING ... is not supported in" + " JDBC catalog.") - case "owner" => // owner is ignored. It is default to current user name. - case "location" => + case TableCatalog.PROP_OWNER => // owner is ignored. It is default to current user name. + case TableCatalog.PROP_LOCATION => throw new AnalysisException("CREATE TABLE ... LOCATION ... 
is not supported in" + " JDBC catalog.") case _ => tableProperties = tableProperties + " " + s"$k $v" @@ -171,6 +175,132 @@ class JDBCTableCatalog extends TableCatalog with Logging { } } + override def namespaceExists(namespace: Array[String]): Boolean = namespace match { + case Array(db) => + withConnection { conn => + val rs = conn.getMetaData.getSchemas(null, db) + while (rs.next()) { + if (rs.getString(1) == db) return true; + } + false + } + case _ => false + } + + override def listNamespaces(): Array[Array[String]] = { + withConnection { conn => + val schemaBuilder = ArrayBuilder.make[Array[String]] + val rs = conn.getMetaData.getSchemas() + while (rs.next()) { + schemaBuilder += Array(rs.getString(1)) + } + schemaBuilder.result + } + } + + override def listNamespaces(namespace: Array[String]): Array[Array[String]] = { + namespace match { + case Array() => + listNamespaces() + case Array(_) if namespaceExists(namespace) => + Array() + case _ => + throw new NoSuchNamespaceException(namespace) + } + } + + override def loadNamespaceMetadata(namespace: Array[String]): util.Map[String, String] = { + namespace match { + case Array(db) => + if (!namespaceExists(namespace)) throw new NoSuchNamespaceException(db) + mutable.HashMap[String, String]().asJava + + case _ => + throw new NoSuchNamespaceException(namespace) + } + } + + override def createNamespace( + namespace: Array[String], + metadata: util.Map[String, String]): Unit = namespace match { + case Array(db) if !namespaceExists(namespace) => + var comment = "" + if (!metadata.isEmpty) { + metadata.asScala.map { + case (k, v) => k match { + case SupportsNamespaces.PROP_COMMENT => comment = v + case SupportsNamespaces.PROP_OWNER => // ignore + case SupportsNamespaces.PROP_LOCATION => + throw new AnalysisException("CREATE NAMESPACE ... LOCATION ... 
is not supported in" + + " JDBC catalog.") + case _ => + throw new AnalysisException(s"CREATE NAMESPACE with property $k is not supported in" + + " JDBC catalog.") + } + } + } + withConnection { conn => + classifyException(s"Failed create name space: $db") { + JdbcUtils.createNamespace(conn, options, db, comment) + } + } + + case Array(_) => + throw new NamespaceAlreadyExistsException(namespace) + + case _ => + throw new IllegalArgumentException(s"Invalid namespace name: ${namespace.quoted}") + } + + override def alterNamespace(namespace: Array[String], changes: NamespaceChange*): Unit = { + namespace match { + case Array(db) => + changes.foreach { + case set: NamespaceChange.SetProperty => + if (set.property() == SupportsNamespaces.PROP_COMMENT) { + withConnection { conn => + JdbcUtils.createNamespaceComment(conn, options, db, set.value) + } + } else { + throw new AnalysisException(s"SET NAMESPACE with property ${set.property} " + + "is not supported in JDBC catalog.") + } + + case unset: NamespaceChange.RemoveProperty => + if (unset.property() == SupportsNamespaces.PROP_COMMENT) { + withConnection { conn => + JdbcUtils.removeNamespaceComment(conn, options, db) + } + } else { + throw new AnalysisException(s"Remove NAMESPACE property ${unset.property} " + + "is not supported in JDBC catalog.") + } + + case _ => + throw new AnalysisException(s"Unsupported NamespaceChange $changes in JDBC catalog.") + } + + case _ => + throw new NoSuchNamespaceException(namespace) + } + } + + override def dropNamespace(namespace: Array[String]): Boolean = namespace match { + case Array(db) if namespaceExists(namespace) => + if (listTables(Array(db)).nonEmpty) { + throw new IllegalStateException(s"Namespace ${namespace.quoted} is not empty") + } + withConnection { conn => + classifyException(s"Failed drop name space: $db") { + JdbcUtils.dropNamespace(conn, options, db) + true + } + } + + case _ => + throw new NoSuchNamespaceException(namespace) + } + private def checkNamespace(namespace: Array[String]): Unit = { // In JDBC there is no nested database/schema if (namespace.length > 1) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index b12882b72fb66..ead0a1aa3a243 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.jdbc -import java.sql.{Connection, Date, SQLFeatureNotSupportedException, Timestamp} +import java.sql.{Connection, Date, Timestamp} import scala.collection.mutable.ArrayBuilder @@ -232,7 +232,7 @@ abstract class JdbcDialect extends Serializable with Logging{ val name = updateNull.fieldNames updateClause += getUpdateColumnNullabilityQuery(tableName, name(0), updateNull.nullable()) case _ => - throw new SQLFeatureNotSupportedException(s"Unsupported TableChange $change") + throw new AnalysisException(s"Unsupported TableChange $change in JDBC catalog.") } } updateClause.result() @@ -270,6 +270,14 @@ abstract class JdbcDialect extends Serializable with Logging{ s"COMMENT ON TABLE $table IS '$comment'" } + def getSchemaCommentQuery(schema: String, comment: String): String = { + s"COMMENT ON SCHEMA ${quoteIdentifier(schema)} IS '$comment'" + } + + def removeSchemaCommentQuery(schema: String): String = { + s"COMMENT ON SCHEMA ${quoteIdentifier(schema)} IS NULL" + } + /** * Gets a dialect exception, classifies it and wraps it by `AnalysisException`. 
* @param message The error message to be placed to the returned exception. From e8380665c7e3aca446631964f49e09f264dee1c2 Mon Sep 17 00:00:00 2001 From: Gengliang Wang Date: Fri, 4 Dec 2020 16:24:41 +0900 Subject: [PATCH 101/150] [SPARK-33658][SQL] Suggest using Datetime conversion functions for invalid ANSI casting ### What changes were proposed in this pull request? Suggest users using Datetime conversion functions in the error message of invalid ANSI explicit casting. ### Why are the changes needed? In ANSI mode, explicit cast between DateTime types and Numeric types is not allowed. As of now, we have introduced new functions `UNIX_SECONDS`/`UNIX_MILLIS`/`UNIX_MICROS`/`UNIX_DATE`/`DATE_FROM_UNIX_DATE`, we can show suggestions to users so that they can complete these type conversions precisely and easily in ANSI mode. ### Does this PR introduce _any_ user-facing change? Yes, better error messages ### How was this patch tested? Unit test Closes #30603 from gengliangwang/improveErrorMsgOfExplicitCast. Authored-by: Gengliang Wang Signed-off-by: HyukjinKwon --- docs/sql-ref-ansi-compliance.md | 11 +++++++ .../spark/sql/catalyst/expressions/Cast.scala | 30 +++++++++++++++---- .../sql/catalyst/expressions/CastSuite.scala | 12 ++++++-- 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index c13ea2b167d93..c3e17dc22eed0 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -96,6 +96,10 @@ java.lang.NumberFormatException: invalid input syntax for type numeric: a SELECT CAST(2147483648L AS INT); java.lang.ArithmeticException: Casting 2147483648 to int causes overflow +SELECT CAST(DATE'2020-01-01' AS INT) +org.apache.spark.sql.AnalysisException: cannot resolve 'CAST(DATE '2020-01-01' AS INT)' due to data type mismatch: cannot cast date to int. +To convert values from date to int, you can use function UNIX_DATE instead. + -- `spark.sql.ansi.enabled=false` (This is a default behaviour) SELECT CAST('a' AS INT); +--------------+ @@ -111,6 +115,13 @@ SELECT CAST(2147483648L AS INT); | -2147483648| +-----------------------+ +SELECT CAST(DATE'2020-01-01' AS INT) ++------------------------------+ +|CAST(DATE '2020-01-01' AS INT)| ++------------------------------+ +| null| ++------------------------------+ + -- Examples of store assignment rules CREATE TABLE t (v INT); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 1b2e2db932970..72bd9ca4d3d1c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -1894,6 +1894,19 @@ object AnsiCast { case _ => false } + // Show suggestion on how to complete the disallowed explicit casting with built-in type + // conversion functions. + private def suggestionOnConversionFunctions ( + from: DataType, + to: DataType, + functionNames: String): String = { + // scalastyle:off line.size.limit + s"""cannot cast ${from.catalogString} to ${to.catalogString}. + |To convert values from ${from.catalogString} to ${to.catalogString}, you can use $functionNames instead. 
+ |""".stripMargin + // scalastyle:on line.size.limit + } + def typeCheckFailureMessage( from: DataType, to: DataType, @@ -1901,12 +1914,19 @@ object AnsiCast { fallbackConfValue: String): String = (from, to) match { case (_: NumericType, TimestampType) => - // scalastyle:off line.size.limit - s""" - | cannot cast ${from.catalogString} to ${to.catalogString}. - | To convert values from ${from.catalogString} to ${to.catalogString}, you can use functions TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS instead. - |""".stripMargin + suggestionOnConversionFunctions(from, to, + "functions TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS") + + case (TimestampType, _: NumericType) => + suggestionOnConversionFunctions(from, to, "functions UNIX_SECONDS/UNIX_MILLIS/UNIX_MICROS") + + case (_: NumericType, DateType) => + suggestionOnConversionFunctions(from, to, "function DATE_FROM_UNIX_DATE") + + case (DateType, _: NumericType) => + suggestionOnConversionFunctions(from, to, "function UNIX_DATE") + // scalastyle:off line.size.limit case (_: ArrayType, StringType) => s""" | cannot cast ${from.catalogString} to ${to.catalogString} with ANSI mode on. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 35db25ec9342c..e46599dc19a8b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -850,18 +850,26 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { test("ANSI mode: disallow type conversions between Numeric types and Timestamp type") { import DataTypeTestUtils.numericTypes checkInvalidCastFromNumericType(TimestampType) + var errorMsg = + "you can use functions TIMESTAMP_SECONDS/TIMESTAMP_MILLIS/TIMESTAMP_MICROS instead" + verifyCastFailure(cast(Literal(0L), TimestampType), Some(errorMsg)) + val timestampLiteral = Literal(1L, TimestampType) + errorMsg = "you can use functions UNIX_SECONDS/UNIX_MILLIS/UNIX_MICROS instead." numericTypes.foreach { numericType => - verifyCastFailure(cast(timestampLiteral, numericType)) + verifyCastFailure(cast(timestampLiteral, numericType), Some(errorMsg)) } } test("ANSI mode: disallow type conversions between Numeric types and Date type") { import DataTypeTestUtils.numericTypes checkInvalidCastFromNumericType(DateType) + var errorMsg = "you can use function DATE_FROM_UNIX_DATE instead" + verifyCastFailure(cast(Literal(0L), DateType), Some(errorMsg)) val dateLiteral = Literal(1, DateType) + errorMsg = "you can use function UNIX_DATE instead" numericTypes.foreach { numericType => - verifyCastFailure(cast(dateLiteral, numericType)) + verifyCastFailure(cast(dateLiteral, numericType), Some(errorMsg)) } } From 94c144bdd05d6c751dcd907161e1b965e637f69c Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Fri, 4 Dec 2020 16:26:07 +0900 Subject: [PATCH 102/150] [SPARK-33571][SQL][DOCS] Add a ref to INT96 config from the doc for `spark.sql.legacy.parquet.datetimeRebaseModeInWrite/Read` ### What changes were proposed in this pull request? For the SQL configs `spark.sql.legacy.parquet.datetimeRebaseModeInWrite` and `spark.sql.legacy.parquet.datetimeRebaseModeInRead`, improve their descriptions by: 1. Explicitly document on which parquet types, those configs influence on 2. Refer to corresponding configs for `INT96` ### Why are the changes needed? 
To avoid user confusions like reposted in SPARK-33571, and make the config description more precise. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running `./dev/scalastyle`. Closes #30596 from MaxGekk/clarify-rebase-docs. Authored-by: Max Gekk Signed-off-by: HyukjinKwon --- .../apache/spark/sql/internal/SQLConf.scala | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 496065f85fbbf..4442581b77811 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2746,20 +2746,6 @@ object SQLConf { .booleanConf .createWithDefault(false) - val LEGACY_PARQUET_REBASE_MODE_IN_WRITE = - buildConf("spark.sql.legacy.parquet.datetimeRebaseModeInWrite") - .internal() - .doc("When LEGACY, Spark will rebase dates/timestamps from Proleptic Gregorian calendar " + - "to the legacy hybrid (Julian + Gregorian) calendar when writing Parquet files. " + - "When CORRECTED, Spark will not do rebase and write the dates/timestamps as it is. " + - "When EXCEPTION, which is the default, Spark will fail the writing if it sees " + - "ancient dates/timestamps that are ambiguous between the two calendars.") - .version("3.0.0") - .stringConf - .transform(_.toUpperCase(Locale.ROOT)) - .checkValues(LegacyBehaviorPolicy.values.map(_.toString)) - .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString) - val LEGACY_PARQUET_INT96_REBASE_MODE_IN_WRITE = buildConf("spark.sql.legacy.parquet.int96RebaseModeInWrite") .internal() @@ -2774,15 +2760,17 @@ object SQLConf { .checkValues(LegacyBehaviorPolicy.values.map(_.toString)) .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString) - val LEGACY_PARQUET_REBASE_MODE_IN_READ = - buildConf("spark.sql.legacy.parquet.datetimeRebaseModeInRead") + val LEGACY_PARQUET_REBASE_MODE_IN_WRITE = + buildConf("spark.sql.legacy.parquet.datetimeRebaseModeInWrite") .internal() - .doc("When LEGACY, Spark will rebase dates/timestamps from the legacy hybrid (Julian + " + - "Gregorian) calendar to Proleptic Gregorian calendar when reading Parquet files. " + - "When CORRECTED, Spark will not do rebase and read the dates/timestamps as it is. " + - "When EXCEPTION, which is the default, Spark will fail the reading if it sees " + - "ancient dates/timestamps that are ambiguous between the two calendars. This config is " + - "only effective if the writer info (like Spark, Hive) of the Parquet files is unknown.") + .doc("When LEGACY, Spark will rebase dates/timestamps from Proleptic Gregorian calendar " + + "to the legacy hybrid (Julian + Gregorian) calendar when writing Parquet files. " + + "When CORRECTED, Spark will not do rebase and write the dates/timestamps as it is. " + + "When EXCEPTION, which is the default, Spark will fail the writing if it sees " + + "ancient dates/timestamps that are ambiguous between the two calendars. " + + "This config influences on writes of the following parquet logical types: DATE, " + + "TIMESTAMP_MILLIS, TIMESTAMP_MICROS. 
The INT96 type has the separate config: " + + s"${LEGACY_PARQUET_INT96_REBASE_MODE_IN_WRITE.key}.") .version("3.0.0") .stringConf .transform(_.toUpperCase(Locale.ROOT)) @@ -2804,6 +2792,24 @@ object SQLConf { .checkValues(LegacyBehaviorPolicy.values.map(_.toString)) .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString) + val LEGACY_PARQUET_REBASE_MODE_IN_READ = + buildConf("spark.sql.legacy.parquet.datetimeRebaseModeInRead") + .internal() + .doc("When LEGACY, Spark will rebase dates/timestamps from the legacy hybrid (Julian + " + + "Gregorian) calendar to Proleptic Gregorian calendar when reading Parquet files. " + + "When CORRECTED, Spark will not do rebase and read the dates/timestamps as it is. " + + "When EXCEPTION, which is the default, Spark will fail the reading if it sees " + + "ancient dates/timestamps that are ambiguous between the two calendars. This config is " + + "only effective if the writer info (like Spark, Hive) of the Parquet files is unknown. " + + "This config influences on reads of the following parquet logical types: DATE, " + + "TIMESTAMP_MILLIS, TIMESTAMP_MICROS. The INT96 type has the separate config: " + + s"${LEGACY_PARQUET_INT96_REBASE_MODE_IN_READ.key}.") + .version("3.0.0") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValues(LegacyBehaviorPolicy.values.map(_.toString)) + .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString) + val LEGACY_AVRO_REBASE_MODE_IN_WRITE = buildConf("spark.sql.legacy.avro.datetimeRebaseModeInWrite") .internal() From 325abf7957373161d2cf0921d35567235186d6eb Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Fri, 4 Dec 2020 16:45:55 +0900 Subject: [PATCH 103/150] [SPARK-33577][SS] Add support for V1Table in stream writer table API and create table if not exist by default ### What changes were proposed in this pull request? After SPARK-32896, we have table API for stream writer but only support DataSource v2 tables. Here we add the following enhancements: - Create non-existing tables by default - Support both managed and external V1Tables ### Why are the changes needed? Make the API covers more use cases. Especially for the file provider based tables. ### Does this PR introduce _any_ user-facing change? Yes, new features added. ### How was this patch tested? Add new UTs. Closes #30521 from xuanyuanking/SPARK-33577. 
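For illustration, here is a minimal sketch of how the extended `toTable` API is expected to be used once this change lands. The SparkSession setup, table name, checkpoint path, and rate source are placeholders chosen for the example, not part of this patch.

```
// Sketch only: placeholder paths and table name, local SparkSession for brevity.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("stream-to-table-example")
  .master("local[2]")
  .getOrCreate()

// Any streaming source works; the rate source keeps the example self-contained.
val events = spark.readStream.format("rate").load()

// With this change, toTable creates the target table if it does not exist and
// accepts file-provider based (V1) tables such as parquet as streaming sinks.
val query = events.writeStream
  .format("parquet")
  .option("checkpointLocation", "/tmp/checkpoints/rate_events")
  .toTable("rate_events")

query.awaitTermination(10000)  // let a few micro-batches run
query.stop()
spark.table("rate_events").show()
```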
Authored-by: Yuanjian Li Signed-off-by: Jungtaek Lim (HeartSaVioR) --- .../sql/streaming/DataStreamWriter.scala | 101 ++++++++---- .../test/DataStreamTableAPISuite.scala | 151 ++++++++++++++---- 2 files changed, 188 insertions(+), 64 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 9e3599712fde5..01e626e5436a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -22,12 +22,16 @@ import java.util.concurrent.TimeoutException import scala.collection.JavaConverters._ +import org.apache.hadoop.fs.Path + import org.apache.spark.annotation.Evolving import org.apache.spark.api.java.function.VoidFunction2 import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.plans.logical.CreateTableStatement import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableProvider} +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableProvider, V1Table, V2TableWithV1Fallback} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource @@ -298,52 +302,85 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * Starts the execution of the streaming query, which will continually output results to the given - * table as new data arrives. The returned [[StreamingQuery]] object can be used to interact with - * the stream. + * table as new data arrives. A new table will be created if the table not exists. The returned + * [[StreamingQuery]] object can be used to interact with the stream. * * @since 3.1.0 */ @throws[TimeoutException] def toTable(tableName: String): StreamingQuery = { - this.source = SOURCE_NAME_TABLE this.tableName = tableName - startInternal(None) - } - private def startInternal(path: Option[String]): StreamingQuery = { - if (source.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) { - throw new AnalysisException("Hive data source can only be used with tables, you can not " + - "write files of Hive data source directly.") - } + import df.sparkSession.sessionState.analyzer.CatalogAndIdentifier - if (source == SOURCE_NAME_TABLE) { - assertNotPartitioned(SOURCE_NAME_TABLE) + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + val originalMultipartIdentifier = df.sparkSession.sessionState.sqlParser + .parseMultipartIdentifier(tableName) + val CatalogAndIdentifier(catalog, identifier) = originalMultipartIdentifier - import df.sparkSession.sessionState.analyzer.CatalogAndIdentifier + // Currently we don't create a logical streaming writer node in logical plan, so cannot rely + // on analyzer to resolve it. Directly lookup only for temp view to provide clearer message. + // TODO (SPARK-27484): we should add the writing node before the plan is analyzed. 
+ if (df.sparkSession.sessionState.catalog.isTempView(originalMultipartIdentifier)) { + throw new AnalysisException(s"Temporary view $tableName doesn't support streaming write") + } + if (!catalog.asTableCatalog.tableExists(identifier)) { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ - val originalMultipartIdentifier = df.sparkSession.sessionState.sqlParser - .parseMultipartIdentifier(tableName) - val CatalogAndIdentifier(catalog, identifier) = originalMultipartIdentifier - - // Currently we don't create a logical streaming writer node in logical plan, so cannot rely - // on analyzer to resolve it. Directly lookup only for temp view to provide clearer message. - // TODO (SPARK-27484): we should add the writing node before the plan is analyzed. - if (df.sparkSession.sessionState.catalog.isTempView(originalMultipartIdentifier)) { - throw new AnalysisException(s"Temporary view $tableName doesn't support streaming write") - } + /** + * Note, currently the new table creation by this API doesn't fully cover the V2 table. + * TODO (SPARK-33638): Full support of v2 table creation + */ + val cmd = CreateTableStatement( + originalMultipartIdentifier, + df.schema.asNullable, + partitioningColumns.getOrElse(Nil).asTransforms.toSeq, + None, + Map.empty[String, String], + Some(source), + Map.empty[String, String], + extraOptions.get("path"), + None, + None, + external = false, + ifNotExists = false) + Dataset.ofRows(df.sparkSession, cmd) + } - val tableInstance = catalog.asTableCatalog.loadTable(identifier) + val tableInstance = catalog.asTableCatalog.loadTable(identifier) - import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ - val sink = tableInstance match { - case t: SupportsWrite if t.supports(STREAMING_WRITE) => t - case t => throw new AnalysisException(s"Table $tableName doesn't support streaming " + - s"write - $t") + def writeToV1Table(table: CatalogTable): StreamingQuery = { + if (table.tableType == CatalogTableType.VIEW) { + throw new AnalysisException(s"Streaming into views $tableName is not supported.") + } + require(table.provider.isDefined) + if (source != table.provider.get) { + throw new AnalysisException(s"The input source($source) is different from the table " + + s"$tableName's data source provider(${table.provider.get}).") } + format(table.provider.get) + .option("path", new Path(table.location).toString).start() + } - startQuery(sink, extraOptions) - } else if (source == SOURCE_NAME_MEMORY) { + import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ + tableInstance match { + case t: SupportsWrite if t.supports(STREAMING_WRITE) => startQuery(t, extraOptions) + case t: V2TableWithV1Fallback => + writeToV1Table(t.v1Table) + case t: V1Table => + writeToV1Table(t.v1Table) + case t => throw new AnalysisException(s"Table $tableName doesn't support streaming " + + s"write - $t") + } + } + + private def startInternal(path: Option[String]): StreamingQuery = { + if (source.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) { + throw new AnalysisException("Hive data source can only be used with tables, you can not " + + "write files of Hive data source directly.") + } + + if (source == SOURCE_NAME_MEMORY) { assertNotPartitioned(SOURCE_NAME_MEMORY) if (extraOptions.get("queryName").isEmpty) { throw new AnalysisException("queryName must be specified for memory sink") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala index bf850432d5c0e..0296366f3578b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala @@ -26,7 +26,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 import org.apache.spark.sql.connector.{FakeV2Provider, InMemoryTableCatalog, InMemoryTableSessionCatalog} @@ -39,6 +39,7 @@ import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.streaming.sources.FakeScanBuilder import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.Utils class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { import testImplicits._ @@ -175,21 +176,24 @@ class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { test("write: write to table with custom catalog & no namespace") { val tableIdentifier = "testcat.table_name" - spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING foo") - checkAnswer(spark.table(tableIdentifier), Seq.empty) + withTable(tableIdentifier) { + spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING foo") + checkAnswer(spark.table(tableIdentifier), Seq.empty) - runTestWithStreamAppend(tableIdentifier) + runTestWithStreamAppend(tableIdentifier) + } } test("write: write to table with custom catalog & namespace") { spark.sql("CREATE NAMESPACE testcat.ns") - val tableIdentifier = "testcat.ns.table_name" - spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING foo") - checkAnswer(spark.table(tableIdentifier), Seq.empty) + withTable(tableIdentifier) { + spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING foo") + checkAnswer(spark.table(tableIdentifier), Seq.empty) - runTestWithStreamAppend(tableIdentifier) + runTestWithStreamAppend(tableIdentifier) + } } test("write: write to table with default session catalog") { @@ -200,35 +204,19 @@ class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { spark.sql("CREATE NAMESPACE ns") val tableIdentifier = "ns.table_name" - spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING $v2Source") - checkAnswer(spark.table(tableIdentifier), Seq.empty) + withTable(tableIdentifier) { + spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING $v2Source") + checkAnswer(spark.table(tableIdentifier), Seq.empty) - runTestWithStreamAppend(tableIdentifier) + runTestWithStreamAppend(tableIdentifier) + } } test("write: write to non-exist table with custom catalog") { val tableIdentifier = "testcat.nonexisttable" - spark.sql("CREATE NAMESPACE testcat.ns") - - withTempDir { checkpointDir => - val exc = intercept[NoSuchTableException] { - runStreamQueryAppendMode(tableIdentifier, checkpointDir, Seq.empty, Seq.empty) - } - assert(exc.getMessage.contains("nonexisttable")) - } - } - - test("write: write to file provider based table isn't allowed yet") { - val tableIdentifier = 
"table_name" - - spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING parquet") - checkAnswer(spark.table(tableIdentifier), Seq.empty) - withTempDir { checkpointDir => - val exc = intercept[AnalysisException] { - runStreamQueryAppendMode(tableIdentifier, checkpointDir, Seq.empty, Seq.empty) - } - assert(exc.getMessage.contains("doesn't support streaming write")) + withTable(tableIdentifier) { + runTestWithStreamAppend(tableIdentifier) } } @@ -262,8 +250,107 @@ class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { val exc = intercept[AnalysisException] { runStreamQueryAppendMode(viewIdentifier, checkpointDir, Seq.empty, Seq.empty) } - assert(exc.getMessage.contains("doesn't support streaming write")) + assert(exc.getMessage.contains(s"Streaming into views $viewIdentifier is not supported")) + } + } + + test("write: write to an external table") { + withTempDir { dir => + val tableName = "stream_test" + withTable(tableName) { + checkForStreamTable(Some(dir), tableName) + } + } + } + + test("write: write to a managed table") { + val tableName = "stream_test" + withTable(tableName) { + checkForStreamTable(None, tableName) + } + } + + test("write: write to an external table with existing path") { + withTempDir { dir => + val tableName = "stream_test" + withTable(tableName) { + // The file written by batch will not be seen after the table was written by a streaming + // query. This is because we loads files from the metadata log instead of listing them + // using HDFS API. + Seq(4, 5, 6).toDF("value").write.format("parquet") + .option("path", dir.getCanonicalPath).saveAsTable(tableName) + + checkForStreamTable(Some(dir), tableName) + } + } + } + + test("write: write to a managed table with existing path") { + val tableName = "stream_test" + withTable(tableName) { + // The file written by batch will not be seen after the table was written by a streaming + // query. This is because we loads files from the metadata log instead of listing them + // using HDFS API. + Seq(4, 5, 6).toDF("value").write.format("parquet").saveAsTable(tableName) + + checkForStreamTable(None, tableName) + } + } + + test("write: write to an external path and create table") { + withTempDir { dir => + val tableName = "stream_test" + withTable(tableName) { + // The file written by batch will not be seen after the table was written by a streaming + // query. This is because we loads files from the metadata log instead of listing them + // using HDFS API. 
+ Seq(4, 5, 6).toDF("value").write + .mode("append").format("parquet").save(dir.getCanonicalPath) + + checkForStreamTable(Some(dir), tableName) + } + } + } + + test("write: write to table with different format shouldn't be allowed") { + val tableName = "stream_test" + + spark.sql(s"CREATE TABLE $tableName (id bigint, data string) USING json") + checkAnswer(spark.table(tableName), Seq.empty) + + withTempDir { checkpointDir => + val exc = intercept[AnalysisException] { + runStreamQueryAppendMode(tableName, checkpointDir, Seq.empty, Seq.empty) + } + assert(exc.getMessage.contains("The input source(parquet) is different from the table " + + s"$tableName's data source provider(json)")) + } + } + + private def checkForStreamTable(dir: Option[File], tableName: String): Unit = { + val memory = MemoryStream[Int] + val dsw = memory.toDS().writeStream.format("parquet") + dir.foreach { output => + dsw.option("path", output.getCanonicalPath) + } + val sq = dsw + .option("checkpointLocation", Utils.createTempDir().getCanonicalPath) + .toTable(tableName) + memory.addData(1, 2, 3) + sq.processAllAvailable() + + checkDataset( + spark.table(tableName).as[Int], + 1, 2, 3) + val catalogTable = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) + val path = if (dir.nonEmpty) { + dir.get + } else { + new File(catalogTable.location) } + checkDataset( + spark.read.format("parquet").load(path.getCanonicalPath).as[Int], + 1, 2, 3) } private def runTestWithStreamAppend(tableIdentifier: String) = { From 91baab77f7e0a5102ac069846f0e2920bb2dd15a Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Thu, 3 Dec 2020 23:47:43 -0800 Subject: [PATCH 104/150] [SPARK-33656][TESTS] Add option to keep container after tests finish for DockerJDBCIntegrationSuites for debug ### What changes were proposed in this pull request? This PR add an option to keep container after DockerJDBCIntegrationSuites (e.g. DB2IntegrationSuite, PostgresIntegrationSuite) finish. By setting a system property `spark.test.docker.keepContainer` to `true`, we can use this option. ### Why are the changes needed? If some error occur during the tests, it would be useful to keep the container for debug. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? I confirmed that the container is kept after the test by the following commands. ``` # With sbt $ build/sbt -Dspark.test.docker.keepContainer=true -Pdocker-integration-tests -Phive -Phive-thriftserver package "testOnly org.apache.spark.sql.jdbc.MariaDBKrbIntegrationSuite" # With Maven $ build/mvn -Dspark.test.docker.keepContainer=true -Pdocker-integration-tests -Phive -Phive-thriftserver -Dtest=none -DwildcardSuites=org.apache.spark.sql.jdbc.MariaDBKrbIntegrationSuite test $ docker container ls ``` I also confirmed that there are no regression for all the subclasses of `DockerJDBCIntegrationSuite` with sbt/Maven. * MariaDBKrbIntegrationSuite * DB2KrbIntegrationSuite * PostgresKrbIntegrationSuite * MySQLIntegrationSuite * PostgresIntegrationSuite * DB2IntegrationSuite * MsSqlServerintegrationsuite * OracleIntegrationSuite * v2.MySQLIntegrationSuite * v2.PostgresIntegrationSuite * v2.DB2IntegrationSuite * v2.MsSqlServerIntegrationSuite * v2.OracleIntegrationSuite NOTE: `DB2IntegrationSuite`, `v2.DB2IntegrationSuite` and `DB2KrbIntegrationSuite` can fail due to the too much short connection timeout. It's a separate issue and I'll fix it in #30583 Closes #30601 from sarutak/keepContainer. 
Authored-by: Kousuke Saruta Signed-off-by: Dongjoon Hyun --- .../sql/jdbc/DockerJDBCIntegrationSuite.scala | 39 ++++++++++++------- pom.xml | 2 + 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala index 00b7b413a964d..d6270313cabea 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala @@ -25,6 +25,7 @@ import scala.collection.JavaConverters._ import scala.util.control.NonFatal import com.spotify.docker.client._ +import com.spotify.docker.client.DockerClient.ListContainersParam import com.spotify.docker.client.exceptions.ImageNotFoundException import com.spotify.docker.client.messages.{ContainerConfig, HostConfig, PortBinding} import org.scalatest.concurrent.Eventually @@ -95,7 +96,9 @@ abstract class DockerJDBCIntegrationSuite extends SharedSparkSession with Eventu protected val dockerIp = DockerUtils.getDockerIp() val db: DatabaseOnDocker - val connectionTimeout = timeout(2.minutes) + val connectionTimeout = timeout(5.minutes) + val keepContainer = + sys.props.getOrElse("spark.test.docker.keepContainer", "false").toBoolean private var docker: DockerClient = _ // Configure networking (necessary for boot2docker / Docker Machine) @@ -176,20 +179,11 @@ abstract class DockerJDBCIntegrationSuite extends SharedSparkSession with Eventu override def afterAll(): Unit = { try { + cleanupContainer() + } finally { if (docker != null) { - try { - if (containerId != null) { - docker.killContainer(containerId) - docker.removeContainer(containerId) - } - } catch { - case NonFatal(e) => - logWarning(s"Could not stop container $containerId", e) - } finally { - docker.close() - } + docker.close() } - } finally { super.afterAll() } } @@ -205,4 +199,23 @@ abstract class DockerJDBCIntegrationSuite extends SharedSparkSession with Eventu * Prepare databases and tables for testing. */ def dataPreparation(connection: Connection): Unit + + private def cleanupContainer(): Unit = { + if (docker != null && containerId != null && !keepContainer) { + try { + docker.killContainer(containerId) + } catch { + case NonFatal(e) => + val exitContainerIds = + docker.listContainers(ListContainersParam.withStatusExited()).asScala.map(_.id()) + if (exitContainerIds.contains(containerId)) { + logWarning(s"Container $containerId already stopped") + } else { + logWarning(s"Could not stop container $containerId", e) + } + } finally { + docker.removeContainer(containerId) + } + } + } } diff --git a/pom.xml b/pom.xml index 4d6e3bbc95378..80097aec0f429 100644 --- a/pom.xml +++ b/pom.xml @@ -250,6 +250,7 @@ --> ${session.executionRootDirectory} + false 1g @@ -2626,6 +2627,7 @@ false true ${spark.test.webdriver.chrome.driver} + ${spark.test.docker.keepContainer} __not_used__ From 976e8970399a1a0fef4c826d4fdd1a138ca52c77 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 4 Dec 2020 00:12:04 -0800 Subject: [PATCH 105/150] [SPARK-33640][TESTS] Extend connection timeout to DB server for DB2IntegrationSuite and its variants ### What changes were proposed in this pull request? This PR extends the connection timeout to the DB server for DB2IntegrationSuite and its variants. 
The container image ibmcom/db2 creates a database when it starts up. The database creation can take over 2 minutes. DB2IntegrationSuite and its variants use the container image but the connection timeout is set to 2 minutes so these suites almost always fail. ### Why are the changes needed? To pass those suites. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? I confirmed the suites pass with the following commands. ``` $ build/sbt -Pdocker-integration-tests -Phive -Phive-thriftserver package "testOnly org.apache.spark.sql.jdbc.DB2IntegrationSuite" $ build/sbt -Pdocker-integration-tests -Phive -Phive-thriftserver package "testOnly org.apache.spark.sql.jdbc.v2.DB2IntegrationSuite" $ build/sbt -Pdocker-integration-tests -Phive -Phive-thriftserver package "testOnly org.apache.spark.sql.jdbc.DB2KrbIntegrationSuite" Closes #30583 from sarutak/extend-timeout-for-db2. Authored-by: Kousuke Saruta Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala | 4 ++++ .../org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala | 3 +++ .../org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala index d086c8cdcc589..49ca91c50d25e 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala @@ -21,6 +21,8 @@ import java.math.BigDecimal import java.sql.{Connection, Date, Timestamp} import java.util.Properties +import org.scalatest.time.SpanSugar._ + import org.apache.spark.sql.Row import org.apache.spark.sql.types.{BooleanType, ByteType, ShortType, StructType} import org.apache.spark.tags.DockerTest @@ -51,6 +53,8 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { s"jdbc:db2://$ip:$port/foo:user=db2inst1;password=rootpass;retrieveMessagesFromServerOnGetMessage=true;" //scalastyle:ignore } + override val connectionTimeout = timeout(3.minutes) + override def dataPreparation(conn: Connection): Unit = { conn.prepareStatement("CREATE TABLE tbl (x INTEGER, y VARCHAR(8))").executeUpdate() conn.prepareStatement("INSERT INTO tbl VALUES (42,'fred')").executeUpdate() diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala index 9c3a609b98bbe..5cbe6fab186a5 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala @@ -24,6 +24,7 @@ import javax.security.auth.login.Configuration import com.spotify.docker.client.messages.{ContainerConfig, HostConfig} import org.apache.hadoop.security.{SecurityUtil, UserGroupInformation} import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod.KERBEROS +import org.scalatest.time.SpanSugar._ import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions import org.apache.spark.sql.execution.datasources.jdbc.connection.{DB2ConnectionProvider, SecureConnectionProvider} @@ -76,6 +77,8 @@ class DB2KrbIntegrationSuite extends 
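For context, the connect-and-retry pattern these suites rely on looks roughly like the sketch below. This is an assumption based on the base suite mixing in ScalaTest's `Eventually`; the JDBC URL, timeout, and polling interval are placeholders.

```
// Sketch of retry-until-timeout connection polling; not code from this patch.
import java.sql.{Connection, DriverManager}

import org.scalatest.concurrent.Eventually._
import org.scalatest.time.SpanSugar._

def waitForDatabase(jdbcUrl: String): Connection =
  // The ibmcom/db2 image creates its database on first start, so early
  // connection attempts fail; keep retrying until the extended timeout expires.
  eventually(timeout(3.minutes), interval(5.seconds)) {
    DriverManager.getConnection(jdbcUrl)
  }
```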
DockerKrbJDBCIntegrationSuite { } } + override val connectionTimeout = timeout(3.minutes) + override protected def setAuthentication(keytabFile: String, principal: String): Unit = { val config = new SecureConnectionProvider.JDBCConfiguration( Configuration.getConfiguration, "JaasClient", keytabFile, principal) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala index 6f803b8f61dd4..8cabf353c6fef 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.jdbc.v2 import java.sql.Connection +import org.scalatest.time.SpanSugar._ + import org.apache.spark.SparkConf import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog @@ -52,6 +54,8 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite with V2JDBCTest { s"jdbc:db2://$ip:$port/foo:user=db2inst1;password=rootpass;retrieveMessagesFromServerOnGetMessage=true;" //scalastyle:ignore } + override val connectionTimeout = timeout(3.minutes) + override def sparkConf: SparkConf = super.sparkConf .set("spark.sql.catalog.db2", classOf[JDBCTableCatalog].getName) .set("spark.sql.catalog.db2.url", db.getJdbcUrl(dockerIp, externalPort)) From 233a8494c8cc7bc8a4a9393ec512943749f11bef Mon Sep 17 00:00:00 2001 From: "Jungtaek Lim (HeartSaVioR)" Date: Fri, 4 Dec 2020 19:33:11 +0900 Subject: [PATCH 106/150] [SPARK-27237][SS] Introduce State schema validation among query restart ## What changes were proposed in this pull request? Please refer the description of [SPARK-27237](https://issues.apache.org/jira/browse/SPARK-27237) to see rationalization of this patch. This patch proposes to introduce state schema validation, via storing key schema and value schema to `schema` file (for the first time) and verify new key schema and value schema for state are compatible with existing one. To be clear for definition of "compatible", state schema is "compatible" when number of fields are same and data type for each field is same - Spark has been allowing rename of field. This patch will prevent query run which has incompatible state schema, which would reduce the chance to get indeterministic behavior (actually renaming of field is also the smell of semantically incompatible, but end users could just modify its name so we can't say) as well as providing more informative error message. ## How was this patch tested? Added UTs. Closes #24173 from HeartSaVioR/SPARK-27237. 
Lead-authored-by: Jungtaek Lim (HeartSaVioR) Co-authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: HyukjinKwon --- .../apache/spark/sql/internal/SQLConf.scala | 10 + .../org/apache/spark/sql/types/DataType.scala | 38 ++- .../execution/streaming/HDFSMetadataLog.scala | 32 +-- .../streaming/MetadataVersionUtil.scala | 51 ++++ .../StateSchemaCompatibilityChecker.scala | 118 +++++++++ .../streaming/state/StateStore.scala | 36 ++- .../streaming/state/StateStoreConf.scala | 3 + ...StateSchemaCompatibilityCheckerSuite.scala | 230 ++++++++++++++++++ .../streaming/StreamingAggregationSuite.scala | 87 ++++++- ...ngStateStoreFormatCompatibilitySuite.scala | 21 +- 10 files changed, 582 insertions(+), 44 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataVersionUtil.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4442581b77811..025478214e492 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1294,6 +1294,14 @@ object SQLConf { .createWithDefault( "org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider") + val STATE_SCHEMA_CHECK_ENABLED = + buildConf("spark.sql.streaming.stateStore.stateSchemaCheck") + .doc("When true, Spark will validate the state schema against schema on existing state and " + + "fail query if it's incompatible.") + .version("3.1.0") + .booleanConf + .createWithDefault(true) + val STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT = buildConf("spark.sql.streaming.stateStore.minDeltasForSnapshot") .internal() @@ -3079,6 +3087,8 @@ class SQLConf extends Serializable with Logging { def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS) + def isStateSchemaCheckEnabled: Boolean = getConf(STATE_SCHEMA_CHECK_ENABLED) + def stateStoreMinDeltasForSnapshot: Int = getConf(STATE_STORE_MIN_DELTAS_FOR_SNAPSHOT) def stateStoreFormatValidationEnabled: Boolean = getConf(STATE_STORE_FORMAT_VALIDATION_ENABLED) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index e4ee6eb377a4d..9e820f0796a96 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -307,21 +307,49 @@ object DataType { * of `fromField.nullable` and `toField.nullable` are false. */ private[sql] def equalsIgnoreCompatibleNullability(from: DataType, to: DataType): Boolean = { + equalsIgnoreCompatibleNullability(from, to, ignoreName = false) + } + + /** + * Compares two types, ignoring compatible nullability of ArrayType, MapType, StructType, and + * also the field name. It compares based on the position. + * + * Compatible nullability is defined as follows: + * - If `from` and `to` are ArrayTypes, `from` has a compatible nullability with `to` + * if and only if `to.containsNull` is true, or both of `from.containsNull` and + * `to.containsNull` are false. 
+ * - If `from` and `to` are MapTypes, `from` has a compatible nullability with `to` + * if and only if `to.valueContainsNull` is true, or both of `from.valueContainsNull` and + * `to.valueContainsNull` are false. + * - If `from` and `to` are StructTypes, `from` has a compatible nullability with `to` + * if and only if for all every pair of fields, `to.nullable` is true, or both + * of `fromField.nullable` and `toField.nullable` are false. + */ + private[sql] def equalsIgnoreNameAndCompatibleNullability( + from: DataType, + to: DataType): Boolean = { + equalsIgnoreCompatibleNullability(from, to, ignoreName = true) + } + + private def equalsIgnoreCompatibleNullability( + from: DataType, + to: DataType, + ignoreName: Boolean = false): Boolean = { (from, to) match { case (ArrayType(fromElement, fn), ArrayType(toElement, tn)) => - (tn || !fn) && equalsIgnoreCompatibleNullability(fromElement, toElement) + (tn || !fn) && equalsIgnoreCompatibleNullability(fromElement, toElement, ignoreName) case (MapType(fromKey, fromValue, fn), MapType(toKey, toValue, tn)) => (tn || !fn) && - equalsIgnoreCompatibleNullability(fromKey, toKey) && - equalsIgnoreCompatibleNullability(fromValue, toValue) + equalsIgnoreCompatibleNullability(fromKey, toKey, ignoreName) && + equalsIgnoreCompatibleNullability(fromValue, toValue, ignoreName) case (StructType(fromFields), StructType(toFields)) => fromFields.length == toFields.length && fromFields.zip(toFields).forall { case (fromField, toField) => - fromField.name == toField.name && + (ignoreName || fromField.name == toField.name) && (toField.nullable || !fromField.nullable) && - equalsIgnoreCompatibleNullability(fromField.dataType, toField.dataType) + equalsIgnoreCompatibleNullability(fromField.dataType, toField.dataType, ignoreName) } case (fromDataType, toDataType) => fromDataType == toDataType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index 893639a86c88c..b87a5b49eb6ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -267,36 +267,8 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: } } - /** - * Parse the log version from the given `text` -- will throw exception when the parsed version - * exceeds `maxSupportedVersion`, or when `text` is malformed (such as "xyz", "v", "v-1", - * "v123xyz" etc.) - */ - private[sql] def validateVersion(text: String, maxSupportedVersion: Int): Int = { - if (text.length > 0 && text(0) == 'v') { - val version = - try { - text.substring(1, text.length).toInt - } catch { - case _: NumberFormatException => - throw new IllegalStateException(s"Log file was malformed: failed to read correct log " + - s"version from $text.") - } - if (version > 0) { - if (version > maxSupportedVersion) { - throw new IllegalStateException(s"UnsupportedLogVersion: maximum supported log version " + - s"is v${maxSupportedVersion}, but encountered v$version. The log file was produced " + - s"by a newer version of Spark and cannot be read by this version. 
Please upgrade.") - } else { - return version - } - } - } - - // reaching here means we failed to read the correct log version - throw new IllegalStateException(s"Log file was malformed: failed to read correct log " + - s"version from $text.") - } + private[sql] def validateVersion(text: String, maxSupportedVersion: Int): Int = + MetadataVersionUtil.validateVersion(text, maxSupportedVersion) } object HDFSMetadataLog { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataVersionUtil.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataVersionUtil.scala new file mode 100644 index 0000000000000..548f2aa5d5c5b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataVersionUtil.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +object MetadataVersionUtil { + /** + * Parse the log version from the given `text` -- will throw exception when the parsed version + * exceeds `maxSupportedVersion`, or when `text` is malformed (such as "xyz", "v", "v-1", + * "v123xyz" etc.) + */ + def validateVersion(text: String, maxSupportedVersion: Int): Int = { + if (text.length > 0 && text(0) == 'v') { + val version = + try { + text.substring(1, text.length).toInt + } catch { + case _: NumberFormatException => + throw new IllegalStateException(s"Log file was malformed: failed to read correct log " + + s"version from $text.") + } + if (version > 0) { + if (version > maxSupportedVersion) { + throw new IllegalStateException(s"UnsupportedLogVersion: maximum supported log version " + + s"is v${maxSupportedVersion}, but encountered v$version. The log file was produced " + + s"by a newer version of Spark and cannot be read by this version. Please upgrade.") + } else { + return version + } + } + } + + // reaching here means we failed to read the correct log version + throw new IllegalStateException(s"Log file was malformed: failed to read correct log " + + s"version from $text.") + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala new file mode 100644 index 0000000000000..4ac12c089c0d3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.state + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, MetadataVersionUtil} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, StructType} + +case class StateSchemaNotCompatible(message: String) extends Exception(message) + +class StateSchemaCompatibilityChecker( + providerId: StateStoreProviderId, + hadoopConf: Configuration) extends Logging { + + private val storeCpLocation = providerId.storeId.storeCheckpointLocation() + private val fm = CheckpointFileManager.create(storeCpLocation, hadoopConf) + private val schemaFileLocation = schemaFile(storeCpLocation) + + fm.mkdirs(schemaFileLocation.getParent) + + def check(keySchema: StructType, valueSchema: StructType): Unit = { + if (fm.exists(schemaFileLocation)) { + logDebug(s"Schema file for provider $providerId exists. Comparing with provided schema.") + val (storedKeySchema, storedValueSchema) = readSchemaFile() + if (storedKeySchema.equals(keySchema) && storedValueSchema.equals(valueSchema)) { + // schema is exactly same + } else if (!schemasCompatible(storedKeySchema, keySchema) || + !schemasCompatible(storedValueSchema, valueSchema)) { + val errorMsg = "Provided schema doesn't match to the schema for existing state! " + + "Please note that Spark allow difference of field name: check count of fields " + + "and data type of each field.\n" + + s"- Provided key schema: $keySchema\n" + + s"- Provided value schema: $valueSchema\n" + + s"- Existing key schema: $storedKeySchema\n" + + s"- Existing value schema: $storedValueSchema\n" + + s"If you want to force running query without schema validation, please set " + + s"${SQLConf.STATE_SCHEMA_CHECK_ENABLED.key} to false.\n" + + "Please note running query with incompatible schema could cause indeterministic" + + " behavior." + logError(errorMsg) + throw StateSchemaNotCompatible(errorMsg) + } else { + logInfo("Detected schema change which is compatible. Allowing to put rows.") + } + } else { + // schema doesn't exist, create one now + logDebug(s"Schema file for provider $providerId doesn't exist. Creating one.") + createSchemaFile(keySchema, valueSchema) + } + } + + private def schemasCompatible(storedSchema: StructType, schema: StructType): Boolean = + DataType.equalsIgnoreNameAndCompatibleNullability(storedSchema, schema) + + private def readSchemaFile(): (StructType, StructType) = { + val inStream = fm.open(schemaFileLocation) + try { + val versionStr = inStream.readUTF() + // Currently we only support version 1, which we can simplify the version validation and + // the parse logic. 
+ val version = MetadataVersionUtil.validateVersion(versionStr, + StateSchemaCompatibilityChecker.VERSION) + require(version == 1) + + val keySchemaStr = inStream.readUTF() + val valueSchemaStr = inStream.readUTF() + + (StructType.fromString(keySchemaStr), StructType.fromString(valueSchemaStr)) + } catch { + case e: Throwable => + logError(s"Fail to read schema file from $schemaFileLocation", e) + throw e + } finally { + inStream.close() + } + } + + private def createSchemaFile(keySchema: StructType, valueSchema: StructType): Unit = { + val outStream = fm.createAtomic(schemaFileLocation, overwriteIfPossible = false) + try { + outStream.writeUTF(s"v${StateSchemaCompatibilityChecker.VERSION}") + outStream.writeUTF(keySchema.json) + outStream.writeUTF(valueSchema.json) + outStream.close() + } catch { + case e: Throwable => + logError(s"Fail to write schema file to $schemaFileLocation", e) + outStream.cancel() + throw e + } + } + + private def schemaFile(storeCpLocation: Path): Path = + new Path(new Path(storeCpLocation, "_metadata"), "schema") +} + +object StateSchemaCompatibilityChecker { + val VERSION = 1 +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 05bcee7b05c6f..ab67c19783ff7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -22,6 +22,7 @@ import java.util.concurrent.{ScheduledFuture, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.mutable +import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration @@ -280,14 +281,14 @@ object StateStoreProvider { * Return a instance of the required provider, initialized with the given configurations. */ def createAndInit( - stateStoreId: StateStoreId, + providerId: StateStoreProviderId, keySchema: StructType, valueSchema: StructType, indexOrdinal: Option[Int], // for sorting the data storeConf: StateStoreConf, hadoopConf: Configuration): StateStoreProvider = { val provider = create(storeConf.providerClass) - provider.init(stateStoreId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) + provider.init(providerId.storeId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) provider } @@ -386,10 +387,14 @@ object StateStore extends Logging { val MAINTENANCE_INTERVAL_CONFIG = "spark.sql.streaming.stateStore.maintenanceInterval" val MAINTENANCE_INTERVAL_DEFAULT_SECS = 60 + val PARTITION_ID_TO_CHECK_SCHEMA = 0 @GuardedBy("loadedProviders") private val loadedProviders = new mutable.HashMap[StateStoreProviderId, StateStoreProvider]() + @GuardedBy("loadedProviders") + private val schemaValidated = new mutable.HashMap[StateStoreProviderId, Option[Throwable]]() + /** * Runs the `task` periodically and automatically cancels it if there is an exception. `onError` * will be called when an exception happens. 
@@ -467,10 +472,29 @@ object StateStore extends Logging { hadoopConf: Configuration): StateStoreProvider = { loadedProviders.synchronized { startMaintenanceIfNeeded() + + if (storeProviderId.storeId.partitionId == PARTITION_ID_TO_CHECK_SCHEMA) { + val result = schemaValidated.getOrElseUpdate(storeProviderId, { + val checker = new StateSchemaCompatibilityChecker(storeProviderId, hadoopConf) + // regardless of configuration, we check compatibility to at least write schema file + // if necessary + val ret = Try(checker.check(keySchema, valueSchema)).toEither.fold(Some(_), _ => None) + if (storeConf.stateSchemaCheckEnabled) { + ret + } else { + None + } + }) + + if (result.isDefined) { + throw result.get + } + } + val provider = loadedProviders.getOrElseUpdate( storeProviderId, StateStoreProvider.createAndInit( - storeProviderId.storeId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) + storeProviderId, keySchema, valueSchema, indexOrdinal, storeConf, hadoopConf) ) reportActiveStoreInstance(storeProviderId) provider @@ -482,6 +506,12 @@ object StateStore extends Logging { loadedProviders.remove(storeProviderId).foreach(_.close()) } + /** Unload all state store providers: unit test purpose */ + private[sql] def unloadAll(): Unit = loadedProviders.synchronized { + loadedProviders.keySet.foreach { key => unload(key) } + loadedProviders.clear() + } + /** Whether a state store provider is loaded or not */ def isLoaded(storeProviderId: StateStoreProviderId): Boolean = loadedProviders.synchronized { loadedProviders.contains(storeProviderId) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala index 11043bc81ae3f..23cb3be32c85a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreConf.scala @@ -55,6 +55,9 @@ class StateStoreConf( /** The compression codec used to compress delta and snapshot files. */ val compressionCodec: String = sqlConf.stateStoreCompressionCodec + /** whether to validate state schema during query run. */ + val stateSchemaCheckEnabled = sqlConf.isStateSchemaCheckEnabled + /** * Additional configurations related to state store. This will capture all configs in * SQLConf that start with `spark.sql.streaming.stateStore.` and extraOptions for a specific diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala new file mode 100644 index 0000000000000..4eb7603b316aa --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityCheckerSuite.scala @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.state + +import java.util.UUID + +import scala.util.Random + +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.sql.execution.streaming.state.StateStoreTestsHelper.newDir +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ + +class StateSchemaCompatibilityCheckerSuite extends SharedSparkSession { + + private val hadoopConf: Configuration = new Configuration() + private val opId = Random.nextInt(100000) + private val partitionId = StateStore.PARTITION_ID_TO_CHECK_SCHEMA + + private val structSchema = new StructType() + .add(StructField("nested1", IntegerType, nullable = true)) + .add(StructField("nested2", StringType, nullable = true)) + + private val keySchema = new StructType() + .add(StructField("key1", IntegerType, nullable = true)) + .add(StructField("key2", StringType, nullable = true)) + .add(StructField("key3", structSchema, nullable = true)) + + private val valueSchema = new StructType() + .add(StructField("value1", IntegerType, nullable = true)) + .add(StructField("value2", StringType, nullable = true)) + .add(StructField("value3", structSchema, nullable = true)) + + test("adding field to key should fail") { + val fieldAddedKeySchema = keySchema.add(StructField("newKey", IntegerType)) + verifyException(keySchema, valueSchema, fieldAddedKeySchema, valueSchema) + } + + test("adding field to value should fail") { + val fieldAddedValueSchema = valueSchema.add(StructField("newValue", IntegerType)) + verifyException(keySchema, valueSchema, keySchema, fieldAddedValueSchema) + } + + test("adding nested field in key should fail") { + val fieldAddedNestedSchema = structSchema.add(StructField("newNested", IntegerType)) + val newKeySchema = applyNewSchemaToNestedFieldInKey(fieldAddedNestedSchema) + verifyException(keySchema, valueSchema, newKeySchema, valueSchema) + } + + test("adding nested field in value should fail") { + val fieldAddedNestedSchema = structSchema.add(StructField("newNested", IntegerType)) + val newValueSchema = applyNewSchemaToNestedFieldInValue(fieldAddedNestedSchema) + verifyException(keySchema, valueSchema, keySchema, newValueSchema) + } + + test("removing field from key should fail") { + val fieldRemovedKeySchema = StructType(keySchema.dropRight(1)) + verifyException(keySchema, valueSchema, fieldRemovedKeySchema, valueSchema) + } + + test("removing field from value should fail") { + val fieldRemovedValueSchema = StructType(valueSchema.drop(1)) + verifyException(keySchema, valueSchema, keySchema, fieldRemovedValueSchema) + } + + test("removing nested field from key should fail") { + val fieldRemovedNestedSchema = StructType(structSchema.dropRight(1)) + val newKeySchema = applyNewSchemaToNestedFieldInKey(fieldRemovedNestedSchema) + verifyException(keySchema, valueSchema, newKeySchema, valueSchema) + } + + test("removing nested field from value should fail") { + val fieldRemovedNestedSchema = StructType(structSchema.drop(1)) + val newValueSchema = applyNewSchemaToNestedFieldInValue(fieldRemovedNestedSchema) + verifyException(keySchema, 
valueSchema, keySchema, newValueSchema) + } + + test("changing the type of field in key should fail") { + val typeChangedKeySchema = StructType(keySchema.map(_.copy(dataType = TimestampType))) + verifyException(keySchema, valueSchema, typeChangedKeySchema, valueSchema) + } + + test("changing the type of field in value should fail") { + val typeChangedValueSchema = StructType(valueSchema.map(_.copy(dataType = TimestampType))) + verifyException(keySchema, valueSchema, keySchema, typeChangedValueSchema) + } + + test("changing the type of nested field in key should fail") { + val typeChangedNestedSchema = StructType(structSchema.map(_.copy(dataType = TimestampType))) + val newKeySchema = applyNewSchemaToNestedFieldInKey(typeChangedNestedSchema) + verifyException(keySchema, valueSchema, newKeySchema, valueSchema) + } + + test("changing the type of nested field in value should fail") { + val typeChangedNestedSchema = StructType(structSchema.map(_.copy(dataType = TimestampType))) + val newValueSchema = applyNewSchemaToNestedFieldInValue(typeChangedNestedSchema) + verifyException(keySchema, valueSchema, keySchema, newValueSchema) + } + + test("changing the nullability of nullable to non-nullable in key should fail") { + val nonNullChangedKeySchema = StructType(keySchema.map(_.copy(nullable = false))) + verifyException(keySchema, valueSchema, nonNullChangedKeySchema, valueSchema) + } + + test("changing the nullability of nullable to non-nullable in value should fail") { + val nonNullChangedValueSchema = StructType(valueSchema.map(_.copy(nullable = false))) + verifyException(keySchema, valueSchema, keySchema, nonNullChangedValueSchema) + } + + test("changing the nullability of nullable to nonnullable in nested field in key should fail") { + val typeChangedNestedSchema = StructType(structSchema.map(_.copy(nullable = false))) + val newKeySchema = applyNewSchemaToNestedFieldInKey(typeChangedNestedSchema) + verifyException(keySchema, valueSchema, newKeySchema, valueSchema) + } + + test("changing the nullability of nullable to nonnullable in nested field in value should fail") { + val typeChangedNestedSchema = StructType(structSchema.map(_.copy(nullable = false))) + val newValueSchema = applyNewSchemaToNestedFieldInValue(typeChangedNestedSchema) + verifyException(keySchema, valueSchema, keySchema, newValueSchema) + } + + test("changing the name of field in key should be allowed") { + val newName: StructField => StructField = f => f.copy(name = f.name + "_new") + val fieldNameChangedKeySchema = StructType(keySchema.map(newName)) + verifySuccess(keySchema, valueSchema, fieldNameChangedKeySchema, valueSchema) + } + + test("changing the name of field in value should be allowed") { + val newName: StructField => StructField = f => f.copy(name = f.name + "_new") + val fieldNameChangedValueSchema = StructType(valueSchema.map(newName)) + verifySuccess(keySchema, valueSchema, keySchema, fieldNameChangedValueSchema) + } + + test("changing the name of nested field in key should be allowed") { + val newName: StructField => StructField = f => f.copy(name = f.name + "_new") + val newNestedFieldsSchema = StructType(structSchema.map(newName)) + val fieldNameChangedKeySchema = applyNewSchemaToNestedFieldInKey(newNestedFieldsSchema) + verifySuccess(keySchema, valueSchema, fieldNameChangedKeySchema, valueSchema) + } + + test("changing the name of nested field in value should be allowed") { + val newName: StructField => StructField = f => f.copy(name = f.name + "_new") + val newNestedFieldsSchema = 
StructType(structSchema.map(newName)) + val fieldNameChangedValueSchema = applyNewSchemaToNestedFieldInValue(newNestedFieldsSchema) + verifySuccess(keySchema, valueSchema, keySchema, fieldNameChangedValueSchema) + } + + private def applyNewSchemaToNestedFieldInKey(newNestedSchema: StructType): StructType = { + applyNewSchemaToNestedField(keySchema, newNestedSchema, "key3") + } + + private def applyNewSchemaToNestedFieldInValue(newNestedSchema: StructType): StructType = { + applyNewSchemaToNestedField(valueSchema, newNestedSchema, "value3") + } + + private def applyNewSchemaToNestedField( + originSchema: StructType, + newNestedSchema: StructType, + fieldName: String): StructType = { + val newFields = originSchema.map { field => + if (field.name == fieldName) { + field.copy(dataType = newNestedSchema) + } else { + field + } + } + StructType(newFields) + } + + private def runSchemaChecker( + dir: String, + queryId: UUID, + newKeySchema: StructType, + newValueSchema: StructType): Unit = { + // Spark doesn't support online state schema changes, so the schema only needs to be + // checked once per JVM run. + val providerId = StateStoreProviderId( + StateStoreId(dir, opId, partitionId), queryId) + + new StateSchemaCompatibilityChecker(providerId, hadoopConf) + .check(newKeySchema, newValueSchema) + } + + private def verifyException( + oldKeySchema: StructType, + oldValueSchema: StructType, + newKeySchema: StructType, + newValueSchema: StructType): Unit = { + val dir = newDir() + val queryId = UUID.randomUUID() + runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema) + + val e = intercept[StateSchemaNotCompatible] { + runSchemaChecker(dir, queryId, newKeySchema, newValueSchema) + } + + assert(e.getMessage.contains("Provided schema doesn't match to the schema for existing state!")) + assert(e.getMessage.contains(newKeySchema.json)) + assert(e.getMessage.contains(newValueSchema.json)) + assert(e.getMessage.contains(oldKeySchema.json)) + assert(e.getMessage.contains(oldValueSchema.json)) + } + + private def verifySuccess( + oldKeySchema: StructType, + oldValueSchema: StructType, + newKeySchema: StructType, + newValueSchema: StructType): Unit = { + val dir = newDir() + val queryId = UUID.randomUUID() + runSchemaChecker(dir, queryId, oldKeySchema, oldValueSchema) + runSchemaChecker(dir, queryId, newKeySchema, newValueSchema) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 0524e29662014..491b0d8b2c26c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.streaming import java.io.File import java.util.{Locale, TimeZone} +import scala.annotation.tailrec + import org.apache.commons.io.FileUtils import org.scalatest.Assertions @@ -33,7 +35,7 @@ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.MemorySink -import org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManager +import org.apache.spark.sql.execution.streaming.state.{StateSchemaNotCompatible, StateStore, StreamingAggregationStateManager} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import
org.apache.spark.sql.streaming.OutputMode._ @@ -753,6 +755,89 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { ) } + testQuietlyWithAllStateVersions("changing schema of state when restarting query", + (SQLConf.STATE_STORE_FORMAT_VALIDATION_ENABLED.key, "false")) { + withTempDir { tempDir => + val (inputData, aggregated) = prepareTestForChangingSchemaOfState(tempDir) + + // if we don't have verification phase on state schema, modified query would throw NPE with + // stack trace which end users would not easily understand + + testStream(aggregated, Update())( + StartStream(checkpointLocation = tempDir.getAbsolutePath), + AddData(inputData, 21), + ExpectFailure[SparkException] { e => + val stateSchemaExc = findStateSchemaNotCompatible(e) + assert(stateSchemaExc.isDefined) + val msg = stateSchemaExc.get.getMessage + assert(msg.contains("Provided schema doesn't match to the schema for existing state")) + // other verifications are presented in StateStoreSuite + } + ) + } + } + + testQuietlyWithAllStateVersions("changing schema of state when restarting query -" + + " schema check off", + (SQLConf.STATE_SCHEMA_CHECK_ENABLED.key, "false"), + (SQLConf.STATE_STORE_FORMAT_VALIDATION_ENABLED.key, "false")) { + withTempDir { tempDir => + val (inputData, aggregated) = prepareTestForChangingSchemaOfState(tempDir) + + testStream(aggregated, Update())( + StartStream(checkpointLocation = tempDir.getAbsolutePath), + AddData(inputData, 21), + ExpectFailure[SparkException] { e => + val stateSchemaExc = findStateSchemaNotCompatible(e) + // it would bring other error in runtime, but it shouldn't check schema in any way + assert(stateSchemaExc.isEmpty) + } + ) + } + } + + private def prepareTestForChangingSchemaOfState( + tempDir: File): (MemoryStream[Int], DataFrame) = { + val inputData = MemoryStream[Int] + val aggregated = inputData.toDF() + .selectExpr("value % 10 AS id", "value") + .groupBy($"id") + .agg( + sum("value").as("sum_value"), + avg("value").as("avg_value"), + max("value").as("max_value")) + + testStream(aggregated, Update())( + StartStream(checkpointLocation = tempDir.getAbsolutePath), + AddData(inputData, 1, 11), + CheckLastBatch((1L, 12L, 6.0, 11)), + StopStream + ) + + StateStore.unloadAll() + + val inputData2 = MemoryStream[Int] + val aggregated2 = inputData2.toDF() + .selectExpr("value % 10 AS id", "value") + .groupBy($"id") + .agg( + sum("value").as("sum_value"), + avg("value").as("avg_value"), + collect_list("value").as("values")) + + inputData2.addData(1, 11) + + (inputData2, aggregated2) + } + + @tailrec + private def findStateSchemaNotCompatible(exc: Throwable): Option[StateSchemaNotCompatible] = { + exc match { + case e1: StateSchemaNotCompatible => Some(e1) + case e1 if e1.getCause != null => findStateSchemaNotCompatible(e1.getCause) + case _ => None + } + } /** Add blocks of data to the `BlockRDDBackedSource`. 
*/ case class AddBlockData(source: BlockRDDBackedSource, data: Seq[Int]*) extends AddData { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala index 33f6b02acb6dd..1032d6c5b6ff2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingStateStoreFormatCompatibilitySuite.scala @@ -19,12 +19,15 @@ package org.apache.spark.sql.streaming import java.io.File +import scala.annotation.tailrec + import org.apache.commons.io.FileUtils import org.apache.spark.SparkException import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.Complete import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.execution.streaming.state.{InvalidUnsafeRowException, StateSchemaNotCompatible} import org.apache.spark.sql.functions._ import org.apache.spark.util.Utils @@ -239,11 +242,19 @@ class StreamingStateStoreFormatCompatibilitySuite extends StreamTest { CheckAnswer(Row(0, 20, Seq(0, 2, 4, 6, 8)), Row(1, 25, Seq(1, 3, 5, 7, 9))) */ AddData(inputData, 10 to 19: _*), - ExpectFailure[SparkException](e => { - // Check the exception message to make sure the state store format changing. - assert(e.getCause.getCause.getMessage.contains( - "The streaming query failed by state format invalidation.")) - }) + ExpectFailure[SparkException] { e => + assert(findStateSchemaException(e)) + } ) } + + @tailrec + private def findStateSchemaException(exc: Throwable): Boolean = { + exc match { + case _: StateSchemaNotCompatible => true + case _: InvalidUnsafeRowException => true + case e1 if e1.getCause != null => findStateSchemaException(e1.getCause) + case _ => false + } + } } From 990bee9c58ea9abd8c4f04f20c78c6d5b720406a Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Fri, 4 Dec 2020 19:37:03 +0900 Subject: [PATCH 107/150] [SPARK-33615][K8S] Make 'spark.archives' working in Kubernates ### What changes were proposed in this pull request? This PR proposes to make `spark.archives` configuration working in Kubernates. It works without a problem in standalone cluster but there seems a bug in Kubernates. It fails to fetch the file on the driver side as below: ``` 20/12/03 13:33:53 INFO SparkContext: Added JAR file:/tmp/spark-75004286-c83a-4369-b624-14c5d2d2a748/spark-examples_2.12-3.1.0-SNAPSHOT.jar at spark://spark-test-app-48ae737628cee6f8-driver-svc.spark-integration-test.svc:7078/jars/spark-examples_2.12-3.1.0-SNAPSHOT.jar with timestamp 1607002432558 20/12/03 13:33:53 INFO SparkContext: Added archive file:///tmp/tmp4542734800151332666.txt.tar.gz#test_tar_gz at spark://spark-test-app-48ae737628cee6f8-driver-svc.spark-integration-test.svc:7078/files/tmp4542734800151332666.txt.tar.gz with timestamp 1607002432558 20/12/03 13:33:53 INFO TransportClientFactory: Successfully created connection to spark-test-app-48ae737628cee6f8-driver-svc.spark-integration-test.svc/172.17.0.4:7078 after 83 ms (47 ms spent in bootstraps) 20/12/03 13:33:53 INFO Utils: Fetching spark://spark-test-app-48ae737628cee6f8-driver-svc.spark-integration-test.svc:7078/files/tmp4542734800151332666.txt.tar.gz to /tmp/spark-66573e24-27a3-427c-99f4-36f06d9e9cd5/fetchFileTemp2665785666227461849.tmp 20/12/03 13:33:53 ERROR SparkContext: Error initializing SparkContext. 
java.lang.RuntimeException: Stream '/files/tmp4542734800151332666.txt.tar.gz' was not found. at org.apache.spark.network.client.TransportResponseHandler.handle(TransportResponseHandler.java:242) at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:142) at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:53) ``` This is because `spark.archives` was not actually added on the driver side correctly. The changes here fix it by adding and resolving URIs correctly. ### Why are the changes needed? `spark.archives` feature can be leveraged for many things such as Conda support. We should make it working in Kubernates as well. This is a bug fix too. ### Does this PR introduce _any_ user-facing change? No, this feature is not out yet. ### How was this patch tested? I manually tested with Minikube 1.15.1. For an environment issue (?), I had to use a custom namespace, service account and roles. `default` service account does not work for me and complains it doesn't have permissions to get/list pods, etc. ```bash minikube delete minikube start --cpus 12 --memory 16384 kubectl create namespace spark-integration-test cat < Signed-off-by: HyukjinKwon --- .../scala/org/apache/spark/SparkContext.scala | 4 +++- .../org/apache/spark/deploy/SparkSubmit.scala | 13 +++++++++-- docs/running-on-kubernetes.md | 2 +- .../k8s/features/BasicDriverFeatureStep.scala | 22 ++++++++++++++++--- .../k8s/integrationtest/DepsTestsSuite.scala | 12 ++++++++++ .../deploy/k8s/integrationtest/Utils.scala | 22 +++++++++++++++++++ .../org/apache/spark/deploy/yarn/Client.scala | 1 + 7 files changed, 69 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 86f1d745d91d4..17ceb5f1887c6 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1639,7 +1639,9 @@ class SparkContext(config: SparkConf) extends Logging { UriBuilder.fromUri(new URI(key)).fragment(uri.getFragment).build().toString, timestamp).isEmpty) { logInfo(s"Added archive $path at $key with timestamp $timestamp") - val uriToDownload = UriBuilder.fromUri(new URI(key)).fragment(null).build() + // If the scheme is file, use URI to simply copy instead of downloading. + val uriToUse = if (!isLocal && scheme == "file") uri else new URI(key) + val uriToDownload = UriBuilder.fromUri(uriToUse).fragment(null).build() val source = Utils.fetchFile(uriToDownload.toString, Utils.createTempDir(), conf, env.securityManager, hadoopConfiguration, timestamp, useCache = false, shouldUntar = false) val dest = new File( diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index a344bce7a0f3c..ea293f03a2169 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -24,6 +24,7 @@ import java.security.PrivilegedExceptionAction import java.text.ParseException import java.util.{ServiceLoader, UUID} import java.util.jar.JarInputStream +import javax.ws.rs.core.UriBuilder import scala.annotation.tailrec import scala.collection.JavaConverters._ @@ -387,10 +388,18 @@ private[spark] class SparkSubmit extends Logging { // Executors will get the jars from the Spark file server. 
// Explicitly download the related files here args.jars = renameResourcesToLocalFS(args.jars, localJars) - val localFiles = Option(args.files).map { + val filesLocalFiles = Option(args.files).map { downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr) }.orNull - args.files = renameResourcesToLocalFS(args.files, localFiles) + val archiveLocalFiles = Option(args.archives).map { uri => + val resolvedUri = Utils.resolveURI(uri) + val downloadedUri = downloadFileList( + UriBuilder.fromUri(resolvedUri).fragment(null).build().toString, + targetDir, sparkConf, hadoopConf, secMgr) + UriBuilder.fromUri(downloadedUri).fragment(resolvedUri.getFragment).build().toString + }.orNull + args.files = renameResourcesToLocalFS(args.files, filesLocalFiles) + args.archives = renameResourcesToLocalFS(args.archives, archiveLocalFiles) args.pyFiles = renameResourcesToLocalFS(args.pyFiles, localPyFiles) } } diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 71b7df8176d1b..e735c7493486e 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -222,7 +222,7 @@ The app jar file will be uploaded to the S3 and then when the driver is launched to the driver pod and will be added to its classpath. Spark will generate a subdir under the upload path with a random name to avoid conflicts with spark apps running in parallel. User could manage the subdirs created according to his needs. -The client scheme is supported for the application jar, and dependencies specified by properties `spark.jars` and `spark.files`. +The client scheme is supported for the application jar, and dependencies specified by properties `spark.jars`, `spark.files` and `spark.archives`. Important: all client-side dependencies will be uploaded to the given path with a flat directory structure so file names must be unique otherwise files will be overwritten. 
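For illustration only, a minimal sketch of how an application can point `spark.archives` at an archive with an alias fragment; the path, alias, and app name below are made-up examples rather than values taken from this patch. The part after `#` becomes the directory the archive is unpacked into under the container's working directory.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical values for illustration; any archive format spark.archives accepts
// (e.g. tar.gz or zip) that is reachable from the driver would work the same way.
val spark = SparkSession.builder()
  .appName("archives-demo")
  .config("spark.archives", "/tmp/conda_env.tar.gz#environment")
  .getOrCreate()

// Code on the executors (and, after this fix, on the driver in Kubernetes) can then
// read files under the "environment/" directory relative to the working directory.
```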
Also make sure in the derived k8s image default ivy dir diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala index f5ba261c8f405..cec8272beed57 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.deploy.k8s.features +import javax.ws.rs.core.UriBuilder + import scala.collection.JavaConverters._ import scala.collection.mutable @@ -159,11 +161,25 @@ private[spark] class BasicDriverFeatureStep(conf: KubernetesDriverConf) KUBERNETES_DRIVER_SUBMIT_CHECK.key -> "true", MEMORY_OVERHEAD_FACTOR.key -> overheadFactor.toString) // try upload local, resolvable files to a hadoop compatible file system - Seq(JARS, FILES, SUBMIT_PYTHON_FILES).foreach { key => - val value = conf.get(key).filter(uri => KubernetesUtils.isLocalAndResolvable(uri)) + Seq(JARS, FILES, ARCHIVES, SUBMIT_PYTHON_FILES).foreach { key => + val uris = conf.get(key).filter(uri => KubernetesUtils.isLocalAndResolvable(uri)) + val value = { + if (key == ARCHIVES) { + uris.map(UriBuilder.fromUri(_).fragment(null).build()).map(_.toString) + } else { + uris + } + } val resolved = KubernetesUtils.uploadAndTransformFileUris(value, Some(conf.sparkConf)) if (resolved.nonEmpty) { - additionalProps.put(key.key, resolved.mkString(",")) + val resolvedValue = if (key == ARCHIVES) { + uris.zip(resolved).map { case (uri, r) => + UriBuilder.fromUri(r).fragment(new java.net.URI(uri).getFragment).build().toString + } + } else { + resolved + } + additionalProps.put(key.key, resolvedValue.mkString(",")) } } additionalProps.toMap diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala index 760e9ba55d335..a15f7ffa134b8 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala @@ -163,6 +163,18 @@ private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => }) } + test("SPARK-33615: Launcher client archives", k8sTestTag, MinikubeTag) { + tryDepsTest { + val fileName = Utils.createTempFile(FILE_CONTENTS, HOST_PATH) + Utils.createTarGzFile(s"$HOST_PATH/$fileName", s"$HOST_PATH/$fileName.tar.gz") + sparkAppConf.set("spark.archives", s"$HOST_PATH/$fileName.tar.gz#test_tar_gz") + val examplesJar = Utils.getTestFileAbsolutePath(getExamplesJarName(), sparkHomeDir) + runSparkRemoteCheckAndVerifyCompletion(appResource = examplesJar, + appArgs = Array(s"test_tar_gz/$fileName"), + timeout = Option(DEPS_TIMEOUT)) + } + } + test("Launcher python client dependencies using a zip file", k8sTestTag, MinikubeTag) { val inDepsFile = Utils.getTestFileAbsolutePath("py_container_checks.py", sparkHomeDir) val outDepsFile = s"${inDepsFile.substring(0, inDepsFile.lastIndexOf("."))}.zip" diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala 
b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala index ee44cb5f85835..519443130008b 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala @@ -25,6 +25,8 @@ import scala.collection.JavaConverters._ import io.fabric8.kubernetes.client.dsl.ExecListener import okhttp3.Response +import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveOutputStream} +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream import org.apache.commons.compress.utils.IOUtils import org.apache.commons.io.output.ByteArrayOutputStream import org.apache.hadoop.util.VersionInfo @@ -149,4 +151,24 @@ object Utils extends Logging { IOUtils.closeQuietly(fis) IOUtils.closeQuietly(zipOut) } + + def createTarGzFile(inFile: String, outFile: String): Unit = { + val fileToTarGz = new File(inFile) + Utils.tryWithResource( + new FileInputStream(fileToTarGz) + ) { fis => + Utils.tryWithResource( + new TarArchiveOutputStream( + new GzipCompressorOutputStream( + new FileOutputStream( + new File(outFile)))) + ) { tOut => + val tarEntry = new TarArchiveEntry(fileToTarGz, fileToTarGz.getName) + tOut.putArchiveEntry(tarEntry) + IOUtils.copy(fis, tOut) + tOut.closeArchiveEntry() + tOut.finish() + } + } + } } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index d252e8368a0c4..7f791e02a392b 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1629,6 +1629,7 @@ private[spark] class YarnClusterApplication extends SparkApplication { // so remove them from sparkConf here for yarn mode. conf.remove(JARS) conf.remove(FILES) + conf.remove(ARCHIVES) new Client(new ClientArguments(args), conf, null).run() } From acc211d2cf0e6ab94f6578e1eb488766fd20fa4e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 4 Dec 2020 14:01:15 +0000 Subject: [PATCH 108/150] [SPARK-33141][SQL][FOLLOW-UP] Store the max nested view depth in AnalysisContext ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/30289. It removes the hack in `View.effectiveSQLConf`, by putting the max nested view depth in `AnalysisContext`. Then we don't get the max nested view depth from the active SQLConf, which keeps changing during nested view resolution. ### Why are the changes needed? remove hacks. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? If I just remove the hack, `SimpleSQLViewSuite.restrict the nested level of a view` fails. With this fix, it passes again. Closes #30575 from cloud-fan/view. 
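To make the pattern concrete before the diff, here is a simplified, hypothetical sketch of the thread-local context idea described above; `ViewContext` and `withNestedView` are invented names for illustration and this is not the actual Analyzer code. The point is that the configured limit is read only once, when the outermost view resolution starts, and nested resolutions inherit that value instead of re-reading a conf that keeps changing during resolution.

```scala
// Simplified illustration of the approach, not Spark's implementation.
case class ViewContext(depth: Int = 0, maxDepth: Int = -1)

object ViewContext {
  private val current = new ThreadLocal[ViewContext] {
    override def initialValue(): ViewContext = ViewContext()
  }

  def get: ViewContext = current.get()

  // `configuredMax` is evaluated only for the outermost view; nested views reuse it.
  def withNestedView[A](configuredMax: => Int)(body: => A): A = {
    val outer = current.get()
    val max = if (outer.maxDepth == -1) configuredMax else outer.maxDepth
    current.set(ViewContext(outer.depth + 1, max))
    try body finally current.set(outer)
  }
}
```

A caller would wrap each view's child resolution in `ViewContext.withNestedView(conf.maxNestedViewDepth) { ... }` and fail analysis once `ViewContext.get.depth` exceeds `ViewContext.get.maxDepth`, which mirrors what the Analyzer change below does with `AnalysisContext.maxNestedViewDepth`.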
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 49 ++++++++++++------- .../plans/logical/basicLogicalOperators.scala | 3 -- .../spark/sql/execution/SQLViewSuite.scala | 25 ---------- .../sql/execution/SQLViewTestSuite.scala | 32 +++++++++--- 4 files changed, 57 insertions(+), 52 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index ebe1004872ef6..6769dc895d32e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -87,8 +87,8 @@ object FakeV2SessionCatalog extends TableCatalog { } /** - * Provides a way to keep state during the analysis, this enables us to decouple the concerns - * of analysis environment from the catalog. + * Provides a way to keep state during the analysis, mostly for resolving views. This enables us to + * decouple the concerns of analysis environment from the catalog. * The state that is kept here is per-query. * * Note this is thread local. @@ -98,13 +98,21 @@ object FakeV2SessionCatalog extends TableCatalog { * views. * @param nestedViewDepth The nested depth in the view resolution, this enables us to limit the * depth of nested views. + * @param maxNestedViewDepth The maximum allowed depth of nested view resolution. * @param relationCache A mapping from qualified table names to resolved relations. This can ensure * that the table is resolved only once if a table is used multiple times * in a query. + * @param referredTempViewNames All the temp view names referred by the current view we are + * resolving. It's used to make sure the relation resolution is + * consistent between view creation and view resolution. For example, + * if `t` was a permanent table when the current view was created, it + * should still be a permanent table when resolving the current view, + * even if a temp view `t` has been created. */ case class AnalysisContext( catalogAndNamespace: Seq[String] = Nil, nestedViewDepth: Int = 0, + maxNestedViewDepth: Int = -1, relationCache: mutable.Map[Seq[String], LogicalPlan] = mutable.Map.empty, referredTempViewNames: Seq[Seq[String]] = Seq.empty) @@ -118,14 +126,20 @@ object AnalysisContext { private def set(context: AnalysisContext): Unit = value.set(context) - def withAnalysisContext[A]( - catalogAndNamespace: Seq[String], referredTempViewNames: Seq[Seq[String]])(f: => A): A = { + def withAnalysisContext[A](viewDesc: CatalogTable)(f: => A): A = { val originContext = value.get() + val maxNestedViewDepth = if (originContext.maxNestedViewDepth == -1) { + // Here we start to resolve views, get `maxNestedViewDepth` from configs. + SQLConf.get.maxNestedViewDepth + } else { + originContext.maxNestedViewDepth + } val context = AnalysisContext( - catalogAndNamespace, + viewDesc.viewCatalogAndNamespace, originContext.nestedViewDepth + 1, + maxNestedViewDepth, originContext.relationCache, - referredTempViewNames) + viewDesc.viewReferredTempViewNames) set(context) try f finally { set(originContext) } } @@ -1034,18 +1048,19 @@ class Analyzer(override val catalogManager: CatalogManager) // operator. case view @ View(desc, isTempView, _, child) if !child.resolved => // Resolve all the UnresolvedRelations and Views in the child. 
- val newChild = AnalysisContext.withAnalysisContext( - desc.viewCatalogAndNamespace, desc.viewReferredTempViewNames) { - if (AnalysisContext.get.nestedViewDepth > conf.maxNestedViewDepth) { - view.failAnalysis(s"The depth of view ${desc.identifier} exceeds the maximum " + - s"view resolution depth (${conf.maxNestedViewDepth}). Analysis is aborted to " + - s"avoid errors. Increase the value of ${SQLConf.MAX_NESTED_VIEW_DEPTH.key} to " + - "work around this.") - } - SQLConf.withExistingConf(View.effectiveSQLConf(desc.viewSQLConfigs, isTempView)) { - executeSameContext(child) - } + val newChild = AnalysisContext.withAnalysisContext(desc) { + val nestedViewDepth = AnalysisContext.get.nestedViewDepth + val maxNestedViewDepth = AnalysisContext.get.maxNestedViewDepth + if (nestedViewDepth > maxNestedViewDepth) { + view.failAnalysis(s"The depth of view ${desc.identifier} exceeds the maximum " + + s"view resolution depth ($maxNestedViewDepth). Analysis is aborted to " + + s"avoid errors. Increase the value of ${SQLConf.MAX_NESTED_VIEW_DEPTH.key} to " + + "work around this.") + } + SQLConf.withExistingConf(View.effectiveSQLConf(desc.viewSQLConfigs, isTempView)) { + executeSameContext(child) } + } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => p.copy(child = resolveViews(view)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c8b7e8651686a..aa7151ad36850 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -483,9 +483,6 @@ object View { for ((k, v) <- configs) { sqlConf.settings.put(k, v) } - // We should respect the current maxNestedViewDepth cause the view resolving are executed - // from top to down. - sqlConf.setConf(SQLConf.MAX_NESTED_VIEW_DEPTH, activeConf.maxNestedViewDepth) sqlConf } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 709d6321d199d..c4303f0f1e19d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -704,31 +704,6 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } } - test("restrict the nested level of a view") { - val viewNames = Array.range(0, 11).map(idx => s"view$idx") - withView(viewNames: _*) { - sql("CREATE VIEW view0 AS SELECT * FROM jt") - Array.range(0, 10).foreach { idx => - sql(s"CREATE VIEW view${idx + 1} AS SELECT * FROM view$idx") - } - - withSQLConf(MAX_NESTED_VIEW_DEPTH.key -> "10") { - val e = intercept[AnalysisException] { - sql("SELECT * FROM view10") - }.getMessage - assert(e.contains("The depth of view `default`.`view0` exceeds the maximum view " + - "resolution depth (10). Analysis is aborted to avoid errors. 
Increase the value " + - s"of ${MAX_NESTED_VIEW_DEPTH.key} to work around this.")) - } - - val e = intercept[IllegalArgumentException] { - withSQLConf(MAX_NESTED_VIEW_DEPTH.key -> "0") {} - }.getMessage - assert(e.contains("The maximum depth of a view reference in a nested view must be " + - "positive.")) - } - } - test("permanent view should be case-preserving") { withView("v") { sql("CREATE VIEW v AS SELECT 1 as aBc") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala index fb9f5a73f6d9e..3cffc5bc21ab6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala @@ -121,7 +121,7 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { test("change current database should not change view behavior") { withTable("t") { Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") - val viewName = createView("v1", "SELECT * from t") + val viewName = createView("v1", "SELECT * FROM t") withView(viewName) { withTempDatabase { db => sql(s"USE $db") @@ -135,7 +135,7 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { test("view should read the new data if table is updated") { withTable("t") { Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") - val viewName = createView("v1", "SELECT c1 from t", Seq("c1")) + val viewName = createView("v1", "SELECT c1 FROM t", Seq("c1")) withView(viewName) { Seq(9, 7, 8).toDF("c1").write.mode("overwrite").format("parquet").saveAsTable("t") checkViewOutput(viewName, Seq(Row(9), Row(7), Row(8))) @@ -146,7 +146,7 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { test("add column for table should not affect view output") { withTable("t") { Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") - val viewName = createView("v1", "SELECT * from t") + val viewName = createView("v1", "SELECT * FROM t") withView(viewName) { sql("ALTER TABLE t ADD COLUMN (c2 INT)") checkViewOutput(viewName, Seq(Row(2), Row(3), Row(1))) @@ -157,8 +157,8 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { test("check cyclic view reference on CREATE OR REPLACE VIEW") { withTable("t") { Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") - val viewName1 = createView("v1", "SELECT * from t") - val viewName2 = createView("v2", s"SELECT * from $viewName1") + val viewName1 = createView("v1", "SELECT * FROM t") + val viewName2 = createView("v2", s"SELECT * FROM $viewName1") withView(viewName2, viewName1) { val e = intercept[AnalysisException] { createView("v1", s"SELECT * FROM $viewName2", replace = true) @@ -171,8 +171,8 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { test("check cyclic view reference on ALTER VIEW") { withTable("t") { Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") - val viewName1 = createView("v1", "SELECT * from t") - val viewName2 = createView("v2", s"SELECT * from $viewName1") + val viewName1 = createView("v1", "SELECT * FROM t") + val viewName2 = createView("v2", s"SELECT * FROM $viewName1") withView(viewName2, viewName1) { val e = intercept[AnalysisException] { sql(s"ALTER VIEW $viewName1 AS SELECT * FROM $viewName2") @@ -181,6 +181,24 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { } } } + + test("restrict the nested level of a view") { + val viewNames = 
scala.collection.mutable.ArrayBuffer.empty[String] + val view0 = createView("view0", "SELECT 1") + viewNames += view0 + for (i <- 1 to 10) { + viewNames += createView(s"view$i", s"SELECT * FROM ${viewNames.last}") + } + withView(viewNames.reverse: _*) { + withSQLConf(MAX_NESTED_VIEW_DEPTH.key -> "10") { + val e = intercept[AnalysisException] { + sql(s"SELECT * FROM ${viewNames.last}") + }.getMessage + assert(e.contains("exceeds the maximum view resolution depth (10)")) + assert(e.contains(s"Increase the value of ${MAX_NESTED_VIEW_DEPTH.key}")) + } + } + } } class LocalTempViewTestSuite extends SQLViewTestSuite with SharedSparkSession { From d671e053e9806d6b4e43a39f5018aa9718790160 Mon Sep 17 00:00:00 2001 From: german Date: Sat, 5 Dec 2020 06:51:54 +0900 Subject: [PATCH 109/150] [SPARK-33660][DOCS][SS] Fix Kafka Headers Documentation ### What changes were proposed in this pull request? Update kafka headers documentation, type is not longer a map but an array [jira](https://issues.apache.org/jira/browse/SPARK-33660) ### Why are the changes needed? To help users ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? It is only documentation Closes #30605 from Gschiavon/SPARK-33660-fix-kafka-headers-documentation. Authored-by: german Signed-off-by: Jungtaek Lim (HeartSaVioR) --- docs/structured-streaming-kafka-integration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index f92dd039d53b7..5336695478c14 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -61,7 +61,7 @@ val df = spark .option("includeHeaders", "true") .load() df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "headers") - .as[(String, String, Map)] + .as[(String, String, Array[(String, Array[Byte])])] // Subscribe to multiple topics val df = spark From de9818f043c1ebcda321077633f93072feba601f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 4 Dec 2020 14:10:42 -0800 Subject: [PATCH 110/150] [SPARK-33662][BUILD] Setting version to 3.2.0-SNAPSHOT ### What changes were proposed in this pull request? This PR aims to update `master` branch version to 3.2.0-SNAPSHOT. ### Why are the changes needed? Start to prepare Apache Spark 3.2.0. ### Does this PR introduce _any_ user-facing change? N/A. ### How was this patch tested? Pass the CIs. Closes #30606 from dongjoon-hyun/SPARK-3.2. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/avro/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10-token-provider/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- project/MimaExcludes.scala | 5 +++++ python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 40 files changed, 45 insertions(+), 40 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 2047f0d75ca18..20433362459d9 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 3.1.0 +Version: 3.2.0 Title: R Front End for 'Apache Spark' Description: Provides an R Front end for 'Apache Spark' . Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index d17abe857ade5..6aa97710f7307 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 39cdc6d6d6cd3..4ade8c2032b24 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index d328a7de0a762..0318f60d546e7 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 562a1d495cc8a..6be6df993478d 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 0225db81925c5..7aff79ea91d72 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 72a2c4ceb43b6..b5a6775366a47 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index ea16dadca40cb..e51357d97faab 100644 --- a/common/tags/pom.xml +++ 
b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index 769e2518b1fd4..b22400575dd02 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index ce6f6ed9c7051..84ca852d1f30a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/docs/_config.yml b/docs/_config.yml index 026b3dd804690..a8d42e483d17d 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -19,8 +19,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 3.1.0-SNAPSHOT -SPARK_VERSION_SHORT: 3.1.0 +SPARK_VERSION: 3.2.0-SNAPSHOT +SPARK_VERSION_SHORT: 3.2.0 SCALA_BINARY_VERSION: "2.12" SCALA_VERSION: "2.12.10" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 8b632cef6d44d..3d7713f10402f 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/external/avro/pom.xml b/external/avro/pom.xml index 98036846eb2a8..a8614c4ff76ab 100644 --- a/external/avro/pom.xml +++ b/external/avro/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index b240dd281823a..808f48f18e1ff 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index d9d9fb7f55c77..2359e99f657f9 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 95a99ac88412e..843f16067463f 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-token-provider/pom.xml b/external/kafka-0-10-token-provider/pom.xml index 941946f30e96f..dbe2ab92a28e7 100644 --- a/external/kafka-0-10-token-provider/pom.xml +++ b/external/kafka-0-10-token-provider/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 024fdb26d5bf4..69c5862fdbb2d 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 76ee5bb7b2f85..22259b08141da 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 7e80bd28c19e8..b54ad71eba305 
100644 --- a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 728b489da6785..bbb71035c3e19 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 38836db01553a..3ed68c0652711 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index 8689e0b8a9ea8..03910ba091997 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index a2550ac939e83..5da2a496e9eb8 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 851af8d52a3ee..2a2c373242201 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9eacf380e17f2..f5b5a979e35b8 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 80097aec0f429..1d7704055898b 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9405927eb1cb5..33e65c9def41b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -34,6 +34,10 @@ import com.typesafe.tools.mima.core.ProblemFilters._ */ object MimaExcludes { + // Exclude rules for 3.2.x + lazy val v32excludes = v31excludes ++ Seq( + ) + // Exclude rules for 3.1.x lazy val v31excludes = v30excludes ++ Seq( // mima plugin update caused new incompatibilities to be detected @@ -1742,6 +1746,7 @@ object MimaExcludes { } def excludes(version: String) = version match { + case v if v.startsWith("3.2") => v32excludes case v if v.startsWith("3.1") => v31excludes case v if v.startsWith("3.0") => v30excludes case v if v.startsWith("2.4") => v24excludes diff --git a/python/pyspark/version.py b/python/pyspark/version.py index e8da19fc44185..935795190797f 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "3.1.0.dev0" +__version__ = "3.2.0.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index a1079e7a6fe6a..a982af21d86f9 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 18e1c65e2e932..44df4e1da5331 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 258d3dfc3df9d..bc680077ead8a 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 54a8d66ea1ad6..b9b3642498992 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index e9122ce202723..1d3856742f520 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 6b79eb722fcdd..0553438a1ad4a 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 3f088e420a9a3..5ab66bd5aac8a 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 9cd8adb6cb4df..dd6d21e3cbdac 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 0453094cf8b7b..27d2756c741ef 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 53b49dd320e94..bd8d352092e73 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 6e806413ef261..8fe8ab358d60c 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.12 - 3.1.0-SNAPSHOT + 3.2.0-SNAPSHOT ../pom.xml From b6b45bc695706201693572bfb87bcee310548945 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 4 Dec 2020 15:04:18 -0800 Subject: [PATCH 111/150] [SPARK-33141][SQL][FOLLOW-UP] Fix Scala 2.13 compilation ### What changes were proposed in this pull request? This PR aims to fix Scala 2.13 compilation. ### Why are the changes needed? To recover Scala 2.13. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? 
Pass GitHub Action Scala 2.13 build job. Closes #30611 from dongjoon-hyun/SPARK-33141. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala index 3cffc5bc21ab6..f6172e3b65050 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala @@ -189,7 +189,7 @@ abstract class SQLViewTestSuite extends QueryTest with SQLTestUtils { for (i <- 1 to 10) { viewNames += createView(s"view$i", s"SELECT * FROM ${viewNames.last}") } - withView(viewNames.reverse: _*) { + withView(viewNames.reverse.toSeq: _*) { withSQLConf(MAX_NESTED_VIEW_DEPTH.key -> "10") { val e = intercept[AnalysisException] { sql(s"SELECT * FROM ${viewNames.last}") From 960d6af75d5ef29b1efcf0d03e7db840270382e6 Mon Sep 17 00:00:00 2001 From: allisonwang-db <66282705+allisonwang-db@users.noreply.github.com> Date: Fri, 4 Dec 2020 15:15:19 -0800 Subject: [PATCH 112/150] [SPARK-33472][SQL][FOLLOW-UP] Update RemoveRedundantSorts comment ### What changes were proposed in this pull request? This PR is a follow-up for #30373 that updates the comment for RemoveRedundantSorts in QueryExecution. ### Why are the changes needed? To update an incorrect comment. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? N/A Closes #30584 from allisonwang-db/spark-33472-followup. Authored-by: allisonwang-db <66282705+allisonwang-db@users.noreply.github.com> Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/execution/QueryExecution.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 040d1f36ed8a5..0531dd210e539 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -344,7 +344,7 @@ object QueryExecution { PlanSubqueries(sparkSession), RemoveRedundantProjects, EnsureRequirements, - // `RemoveRedundantSorts` needs to be added before `EnsureRequirements` to guarantee the same + // `RemoveRedundantSorts` needs to be added after `EnsureRequirements` to guarantee the same // number of partitions when instantiating PartitioningCollection. RemoveRedundantSorts, DisableUnnecessaryBucketedScan, From 1b4e35d1a8acf7b744e11b9ac9ca8f81de6db5e5 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 4 Dec 2020 16:48:31 -0800 Subject: [PATCH 113/150] [SPARK-33651][SQL] Allow CREATE EXTERNAL TABLE with LOCATION for data source tables ### What changes were proposed in this pull request? This PR removes the restriction and allows CREATE EXTERNAL TABLE with LOCATION for data source tables. It also moves the check from the analyzer rule `ResolveSessionCatalog` to `SessionCatalog`, so that v2 session catalog can overwrite it. ### Why are the changes needed? It's an unnecessary behavior difference that Hive serde table can be created with `CREATE EXTERNAL TABLE` if LOCATION is present, while data source table doesn't allow `CREATE EXTERNAL TABLE` at all. ### Does this PR introduce _any_ user-facing change? Yes, now `CREATE EXTERNAL TABLE ... 
USING ... LOCATION ...` is allowed. ### How was this patch tested? new tests Closes #30595 from cloud-fan/minor. Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun --- .../sql/catalyst/catalog/SessionCatalog.scala | 5 ++++ .../analysis/ResolveSessionCatalog.scala | 12 +-------- .../datasources/v2/V2SessionCatalog.scala | 7 +++++- .../DataSourceV2SQLSessionCatalogSuite.scala | 8 ++++++ .../connector/TestV2SessionCatalogBase.scala | 24 +++++++++++++++--- .../command/PlanResolutionSuite.scala | 14 ++++++----- .../sources/CreateTableAsSelectSuite.scala | 25 ++++++++----------- .../spark/sql/sources/InsertSuite.scala | 2 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 5 ++-- 9 files changed, 64 insertions(+), 38 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 29481b85e9f2e..0cdbc1a234c66 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -318,6 +318,11 @@ class SessionCatalog( tableDefinition: CatalogTable, ignoreIfExists: Boolean, validateLocation: Boolean = true): Unit = { + val isExternal = tableDefinition.tableType == CatalogTableType.EXTERNAL + if (isExternal && tableDefinition.storage.locationUri.isEmpty) { + throw new AnalysisException(s"CREATE EXTERNAL TABLE must be accompanied by LOCATION") + } + val db = formatDatabaseName(tableDefinition.identifier.database.getOrElse(getCurrentDatabase)) val table = formatTableName(tableDefinition.identifier.table) val tableIdentifier = TableIdentifier(table, Some(db)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index f35fcdc07c372..a87ed4b6275d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -662,17 +662,7 @@ class ResolveSessionCatalog( comment: Option[String], storageFormat: CatalogStorageFormat, external: Boolean): CatalogTable = { - if (external) { - if (DDLUtils.isHiveTable(Some(provider))) { - if (location.isEmpty) { - throw new AnalysisException(s"CREATE EXTERNAL TABLE must be accompanied by LOCATION") - } - } else { - throw new AnalysisException(s"Operation not allowed: CREATE EXTERNAL TABLE ... 
USING") - } - } - - val tableType = if (location.isDefined) { + val tableType = if (external || location.isDefined) { CatalogTableType.EXTERNAL } else { CatalogTableType.MANAGED diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index a0bc65d3f9057..87f5366354fa0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -87,7 +87,12 @@ class V2SessionCatalog(catalog: SessionCatalog) val location = Option(properties.get(TableCatalog.PROP_LOCATION)) val storage = DataSource.buildStorageFormatFromOptions(toOptions(tableProperties.toMap)) .copy(locationUri = location.map(CatalogUtils.stringToURI)) - val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + val isExternal = properties.containsKey(TableCatalog.PROP_EXTERNAL) + val tableType = if (isExternal || location.isDefined) { + CatalogTableType.EXTERNAL + } else { + CatalogTableType.MANAGED + } val tableDesc = CatalogTable( identifier = ident.asTableIdentifier, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala index cf00b3b5e4410..c973e2ba30004 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala @@ -79,4 +79,12 @@ class DataSourceV2SQLSessionCatalogSuite Row("keyX", s"Table default.$t1 does not have property: keyX")) } } + + test("SPARK-33651: allow CREATE EXTERNAL TABLE without LOCATION") { + withTable("t") { + val prop = TestV2SessionCatalogBase.SIMULATE_ALLOW_EXTERNAL_PROPERTY + "=true" + // The following should not throw AnalysisException. 
+ sql(s"CREATE EXTERNAL TABLE t (i INT) USING $v2Format TBLPROPERTIES($prop)") + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala index f57edb9eb220c..bf2749d1afc53 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala @@ -24,7 +24,7 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.catalog.CatalogTableType -import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table, V1Table} +import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table, TableCatalog, V1Table} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.StructType @@ -70,8 +70,22 @@ private[connector] trait TestV2SessionCatalogBase[T <: Table] extends Delegating schema: StructType, partitions: Array[Transform], properties: util.Map[String, String]): Table = { - val created = super.createTable(ident, schema, partitions, properties) - val t = newTable(created.name(), schema, partitions, properties) + val key = TestV2SessionCatalogBase.SIMULATE_ALLOW_EXTERNAL_PROPERTY + val propsWithLocation = if (properties.containsKey(key)) { + // Always set a location so that CREATE EXTERNAL TABLE won't fail with LOCATION not specified. + if (!properties.containsKey(TableCatalog.PROP_LOCATION)) { + val newProps = new util.HashMap[String, String]() + newProps.putAll(properties) + newProps.put(TableCatalog.PROP_LOCATION, "file:/abc") + newProps + } else { + properties + } + } else { + properties + } + val created = super.createTable(ident, schema, partitions, propsWithLocation) + val t = newTable(created.name(), schema, partitions, propsWithLocation) addTable(ident, t) t } @@ -90,3 +104,7 @@ private[connector] trait TestV2SessionCatalogBase[T <: Table] extends Delegating tableCreated.set(false) } } + +object TestV2SessionCatalogBase { + val SIMULATE_ALLOW_EXTERNAL_PROPERTY = "spark.sql.test.simulateAllowExternal" +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 9b7222da55368..38719311f1aef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -1714,14 +1714,16 @@ class PlanResolutionSuite extends AnalysisTest { } } - test("create hive external table - location must be specified") { - val exc = intercept[AnalysisException] { - parseAndResolve("CREATE EXTERNAL TABLE my_tab STORED AS parquet") + test("create hive external table") { + val withoutLoc = "CREATE EXTERNAL TABLE my_tab STORED AS parquet" + parseAndResolve(withoutLoc) match { + case ct: CreateTable => + assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) + assert(ct.tableDesc.storage.locationUri.isEmpty) } - assert(exc.getMessage.contains("CREATE EXTERNAL TABLE must be accompanied by LOCATION")) - val query = "CREATE EXTERNAL TABLE my_tab STORED AS parquet LOCATION '/something/anything'" - parseAndResolve(query) match { + val withLoc = "CREATE EXTERNAL TABLE my_tab STORED AS parquet LOCATION '/something/anything'" + 
parseAndResolve(withLoc) match { case ct: CreateTable => assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 00c599065ce31..9464f7e4c1241 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -22,7 +22,7 @@ import java.io.File import org.apache.spark.SparkException import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTableType} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.internal.SQLConf.BUCKETING_MAX_BUCKETS import org.apache.spark.sql.test.SharedSparkSession @@ -170,20 +170,17 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSparkSession { } } - test("disallows CREATE EXTERNAL TABLE ... USING ... AS query") { + test("SPARK-33651: allow CREATE EXTERNAL TABLE ... USING ... if location is specified") { withTable("t") { - val error = intercept[AnalysisException] { - sql( - s""" - |CREATE EXTERNAL TABLE t USING PARQUET - |OPTIONS (PATH '${path.toURI}') - |AS SELECT 1 AS a, 2 AS b - """.stripMargin - ) - }.getMessage - - assert(error.contains("Operation not allowed") && - error.contains("CREATE EXTERNAL TABLE ...")) + sql( + s""" + |CREATE EXTERNAL TABLE t USING PARQUET + |OPTIONS (PATH '${path.toURI}') + |AS SELECT 1 AS a, 2 AS b + """.stripMargin) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) + assert(table.tableType == CatalogTableType.EXTERNAL) + assert(table.location.toString == path.toURI.toString.stripSuffix("/")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index aaf8765c04425..bfd04ffaaf754 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -845,7 +845,7 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { .add("s", StringType, false) val newTable = CatalogTable( identifier = TableIdentifier("test_table", None), - tableType = CatalogTableType.EXTERNAL, + tableType = CatalogTableType.MANAGED, storage = CatalogStorageFormat( locationUri = None, inputFormat = None, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 41a26344f7c21..0593dbe7f6653 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -711,7 +711,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv identifier = TableIdentifier("wide_schema"), tableType = CatalogTableType.EXTERNAL, storage = CatalogStorageFormat.empty.copy( - properties = Map("path" -> tempDir.getCanonicalPath) + locationUri = Some(tempDir.toURI) ), schema = schema, provider = Some("json") @@ -1076,7 +1076,8 @@ class MetastoreDataSourcesSuite extends QueryTest 
with SQLTestUtils with TestHiv identifier = TableIdentifier("skip_hive_metadata", Some("default")), tableType = CatalogTableType.EXTERNAL, storage = CatalogStorageFormat.empty.copy( - properties = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "true") + locationUri = Some(tempPath.toURI), + properties = Map("skipHiveMetadata" -> "true") ), schema = schema, provider = Some("parquet") From 154f6044033d1a3b4c19c64b206b168bf919cb3b Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Sun, 6 Dec 2020 12:03:14 +0900 Subject: [PATCH 114/150] [MINOR] Fix string interpolation in CommandUtils.scala and KafkaDataConsumer.scala ### What changes were proposed in this pull request? This PR proposes to fix a string interpolation in `CommandUtils.scala` and `KafkaDataConsumer.scala`. ### Why are the changes needed? To fix a string interpolation bug. ### Does this PR introduce _any_ user-facing change? Yes, the string will be correctly constructed. ### How was this patch tested? Existing tests since they were used in exception/log messages. Closes #30609 from imback82/fix_cache_str_interporlation. Authored-by: Terry Kim Signed-off-by: HyukjinKwon --- .../apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala | 2 +- .../org/apache/spark/sql/execution/command/CommandUtils.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala index f2bf7cd1360ec..649430d434a73 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala @@ -276,7 +276,7 @@ private[kafka010] class KafkaDataConsumer( val fetchedData = getOrRetrieveFetchedData(offset) logDebug(s"Get $groupId $topicPartition nextOffset ${fetchedData.nextOffsetInFetchedData} " + - "requested $offset") + s"requested $offset") // The following loop is basically for `failOnDataLoss = false`. When `failOnDataLoss` is // `false`, first, we will try to fetch the record at `offset`. If no such record exists, then diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index f86f62bbf853b..15a735be8043f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -391,7 +391,7 @@ object CommandUtils extends Logging { try { sparkSession.catalog.uncacheTable(name) } catch { - case NonFatal(e) => logWarning("Exception when attempting to uncache $name", e) + case NonFatal(e) => logWarning(s"Exception when attempting to uncache $name", e) } } } From 6317ba29a1bb1b7198fe8df71ddefcf47a55bd51 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Sat, 5 Dec 2020 23:04:55 -0800 Subject: [PATCH 115/150] [SPARK-33668][K8S][TEST] Fix flaky test "Verify logging configuration is picked from the provided ### What changes were proposed in this pull request? Fix flaky test "Verify logging configuration is picked from the provided SPARK_CONF_DIR/log4j.properties." The test is flaking, with multiple flaked instances - the reason for the failure has been similar to: ``` The code passed to eventually never returned normally. 
Attempted 109 times over 3.0079882413999997 minutes. Last failure message: Failure executing: GET at: https://192.168.39.167:8443/api/v1/namespaces/b37fc72a991b49baa68a2eaaa1516463/pods/spark-pi-97a9bc76308e7fe3-exec-1/log?pretty=false. Message: pods "spark-pi-97a9bc76308e7fe3-exec-1" not found. Received status: Status(apiVersion=v1, code=404, details=StatusDetails(causes=[], group=null, kind=pods, name=spark-pi-97a9bc76308e7fe3-exec-1, retryAfterSeconds=null, uid=null, additionalProperties={}), kind=Status, message=pods "spark-pi-97a9bc76308e7fe3-exec-1" not found, metadata=ListMeta(_continue=null, remainingItemCount=null, resourceVersion=null, selfLink=null, additionalProperties={}), reason=NotFound, status=Failure, additionalProperties={}).. (KubernetesSuite.scala:402) ``` https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/36854/console https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/36852/console https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/36850/console https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/36848/console From the above failures, it seems, that executor finishes too quickly and is removed by spark before the test can complete. So, in order to mitigate this situation, one way is to turn on the flag "spark.kubernetes.executor.deleteOnTermination" ### Why are the changes needed? Fixes a flaky test. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests. May be a few runs of jenkins integration test, may reveal if the problem is resolved or not. Closes #30616 from ScrapCodes/SPARK-33668/fix-flaky-k8s-integration-test. Authored-by: Prashant Sharma Signed-off-by: Dongjoon Hyun --- .../k8s/integrationtest/KubernetesSuite.scala | 18 ++++++++++++++++++ .../SparkConfPropagateSuite.scala | 1 + 2 files changed, 19 insertions(+) diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index 193a02aad0cea..7b2a2d0820238 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -158,6 +158,7 @@ class KubernetesSuite extends SparkFunSuite kubernetesTestComponents.deleteNamespace() } deleteDriverPod() + deleteExecutorPod(appLocator) } protected def runSparkPiAndVerifyCompletion( @@ -508,6 +509,23 @@ class KubernetesSuite extends SparkFunSuite .get() == null) } } + + private def deleteExecutorPod(appLocator: String): Unit = { + kubernetesTestComponents + .kubernetesClient + .pods() + .withLabel("spark-app-locator", appLocator) + .withLabel("spark-role", "executor") + .delete() + Eventually.eventually(TIMEOUT, INTERVAL) { + assert(kubernetesTestComponents.kubernetesClient + .pods() + .withLabel("spark-app-locator", appLocator) + .withLabel("spark-role", "executor") + .list() + .getItems.isEmpty) + } + } } private[spark] object KubernetesSuite { diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala index 
5d3b426598fdd..0bc632716fa8b 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala @@ -39,6 +39,7 @@ private[spark] trait SparkConfPropagateSuite { k8sSuite: KubernetesSuite => sparkAppConf.set("spark.driver.extraJavaOptions", "-Dlog4j.debug") sparkAppConf.set("spark.executor.extraJavaOptions", "-Dlog4j.debug") + sparkAppConf.set("spark.kubernetes.executor.deleteOnTermination", "false") val log4jExpectedLog = s"log4j: Reading configuration from URL file:/opt/spark/conf/log4j.properties" From e857e06452c2cf478beb31367f76d6950b660ebb Mon Sep 17 00:00:00 2001 From: Chao Sun Date: Sun, 6 Dec 2020 01:14:22 -0800 Subject: [PATCH 116/150] [SPARK-33652][SQL] DSv2: DeleteFrom should refresh cache ### What changes were proposed in this pull request? This changes `DeleteFromTableExec` to also refresh caches referencing the original table, by passing the `refreshCache` callback to the class. Note that in order to construct the callback, I have to change `DataSourceV2ScanRelation` to contain a `DataSourceV2Relation` instead of a `Table`. ### Why are the changes needed? Currently DSv2 delete from table doesn't refresh caches. This could lead to correctness issue if the staled cache is queried later. ### Does this PR introduce _any_ user-facing change? Yes. Now delete from table in v2 also refreshes cache. ### How was this patch tested? Added a test case. Closes #30597 from sunchao/SPARK-33652. Authored-by: Chao Sun Signed-off-by: Dongjoon Hyun --- .../datasources/v2/DataSourceV2Relation.scala | 6 +++--- .../scala/org/apache/spark/sql/Dataset.scala | 4 ++-- .../datasources/v2/DataSourceV2Strategy.scala | 5 +++-- .../datasources/v2/DeleteFromTableExec.scala | 4 +++- .../datasources/v2/V2ScanRelationPushDown.scala | 2 +- .../sql/connector/DataSourceV2SQLSuite.scala | 16 ++++++++++++++++ 6 files changed, 28 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 4debdd380e6b4..513fce0aba10c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -111,16 +111,16 @@ case class DataSourceV2Relation( * plan. This ensures that the stats that are used by the optimizer account for the filters and * projection that will be pushed down. 
* - * @param table a DSv2 [[Table]] + * @param relation a [[DataSourceV2Relation]] * @param scan a DSv2 [[Scan]] * @param output the output attributes of this relation */ case class DataSourceV2ScanRelation( - table: Table, + relation: DataSourceV2Relation, scan: Scan, output: Seq[AttributeReference]) extends LeafNode with NamedRelation { - override def name: String = table.name() + override def name: String = relation.table.name() override def simpleString(maxFields: Int): String = { s"RelationV2${truncatedString(output, "[", ", ", "]", maxFields)} $name" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 0716043bcf660..05d6647afd958 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -53,7 +53,7 @@ import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.execution.arrow.{ArrowBatchStreamWriter, ArrowConverters} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, FileTable} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation, FileTable} import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.execution.stat.StatFunctions import org.apache.spark.sql.internal.SQLConf @@ -3464,7 +3464,7 @@ class Dataset[T] private[sql]( fr.inputFiles case r: HiveTableRelation => r.tableMeta.storage.locationUri.map(_.toString).toArray - case DataSourceV2ScanRelation(table: FileTable, _, _) => + case DataSourceV2ScanRelation(DataSourceV2Relation(table: FileTable, _, _, _, _), _, _) => table.fileIndex.inputFiles }.flatten files.toSet.toArray diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 938ba77fede47..5289d359f7809 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -208,7 +208,8 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case DeleteFromTable(relation, condition) => relation match { - case DataSourceV2ScanRelation(table, _, output) => + case DataSourceV2ScanRelation(r, _, output) => + val table = r.table if (condition.exists(SubqueryExpression.hasSubquery)) { throw new AnalysisException( s"Delete by condition with subquery is not supported: $condition") @@ -227,7 +228,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat s"Cannot delete from table ${table.name} where ${filters.mkString("[", ", ", "]")}") } - DeleteFromTableExec(table.asDeletable, filters) :: Nil + DeleteFromTableExec(table.asDeletable, filters, refreshCache(r)) :: Nil case _ => throw new AnalysisException("DELETE is only supported with v2 tables.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala index afebbfd01db22..f0a45c249dc10 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala @@ -24,10 +24,12 @@ import org.apache.spark.sql.sources.Filter case class DeleteFromTableExec( table: SupportsDelete, - condition: Array[Filter]) extends V2CommandExec { + condition: Array[Filter], + refreshCache: () => Unit) extends V2CommandExec { override protected def run(): Seq[InternalRow] = { table.deleteWhere(condition) + refreshCache() Seq.empty } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala index b168e848f0b6f..d2180566790ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala @@ -64,7 +64,7 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] { case _ => scan } - val scanRelation = DataSourceV2ScanRelation(relation.table, wrappedScan, output) + val scanRelation = DataSourceV2ScanRelation(relation, wrappedScan, output) val projectionOverSchema = ProjectionOverSchema(output.toStructType) val projectionFunc = (expr: Expression) => expr transformDown { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 6ef4fd1372a78..6838a7644a29f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -1841,6 +1841,22 @@ class DataSourceV2SQLSuite } } + test("SPARK-33652: DeleteFrom should refresh caches referencing the table") { + val t = "testcat.ns1.ns2.tbl" + val view = "view" + withTable(t) { + withTempView(view) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + sql(s"CACHE TABLE view AS SELECT id FROM $t") + assert(spark.table(view).count() == 3) + + sql(s"DELETE FROM $t WHERE id = 2") + assert(spark.table(view).count() == 1) + } + } + } + test("UPDATE TABLE") { val t = "testcat.ns1.ns2.tbl" withTable(t) { From 5250841537d7a8c54fb451748e2a21d3bcc5d966 Mon Sep 17 00:00:00 2001 From: HyukjinKwon Date: Sun, 6 Dec 2020 01:22:24 -0800 Subject: [PATCH 117/150] [SPARK-33256][PYTHON][DOCS] Clarify PySpark follows NumPy documentation style ### What changes were proposed in this pull request? This PR adds few lines about docstring style to document that PySpark follows [NumPy documentation style](https://numpydoc.readthedocs.io/en/latest/format.html). We all completed the migration to NumPy documentation style at SPARK-32085. Ideally we should have a page like https://pandas.pydata.org/docs/development/contributing_docstring.html but I would like to leave it as a future work. ### Why are the changes needed? To tell developers that PySpark now follows NumPy documentation style. ### Does this PR introduce _any_ user-facing change? No, it's a change in unreleased branches yet. ### How was this patch tested? Manually tested via `make clean html` under `python/docs`: ![Screen Shot 2020-12-06 at 1 34 50 PM](https://user-images.githubusercontent.com/6477701/101271623-d5ce0380-37c7-11eb-93ac-da73caa50c37.png) Closes #30622 from HyukjinKwon/SPARK-33256. 
Authored-by: HyukjinKwon Signed-off-by: Dongjoon Hyun --- python/docs/source/development/contributing.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/docs/source/development/contributing.rst b/python/docs/source/development/contributing.rst index 2b62c953e0786..a41b8a1a1de9e 100644 --- a/python/docs/source/development/contributing.rst +++ b/python/docs/source/development/contributing.rst @@ -123,11 +123,12 @@ Annotations can be validated using ``dev/lint-python`` script or by invoking myp -Code Style Guide ----------------- +Code and Docstring Guide +---------------------------------- Please follow the style of the existing codebase as is, which is virtually PEP 8 with one exception: lines can be up to 100 characters in length, not 79. +For the docstring style, PySpark follows `NumPy documentation style `_. Note that the method and variable names in PySpark are the similar case is ``threading`` library in Python itself where the APIs were inspired by Java. PySpark also follows `camelCase` for exposed APIs that match with Scala and Java. From 48297818f37a8e02cc02ba6fa9ec04fe37540aca Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 6 Dec 2020 02:56:08 -0800 Subject: [PATCH 118/150] [SPARK-33667][SQL] Respect the `spark.sql.caseSensitive` config while resolving partition spec in v1 `SHOW PARTITIONS` ### What changes were proposed in this pull request? Preprocess the partition spec passed to the V1 SHOW PARTITIONS implementation `ShowPartitionsCommand`, and normalize the passed spec according to the partition columns w.r.t the case sensitivity flag **spark.sql.caseSensitive**. ### Why are the changes needed? V1 SHOW PARTITIONS is case sensitive in fact, and doesn't respect the SQL config **spark.sql.caseSensitive** which is false by default, for instance: ```sql spark-sql> CREATE TABLE tbl1 (price int, qty int, year int, month int) > USING parquet > PARTITIONED BY (year, month); spark-sql> INSERT INTO tbl1 PARTITION(year = 2015, month = 1) SELECT 1, 1; spark-sql> SHOW PARTITIONS tbl1 PARTITION(YEAR = 2015, Month = 1); Error in query: Non-partitioning column(s) [YEAR, Month] are specified for SHOW PARTITIONS; ``` The `SHOW PARTITIONS` command must show the partition `year = 2015, month = 1` specified by `YEAR = 2015, Month = 1`. ### Does this PR introduce _any_ user-facing change? Yes. After the changes, the command above works as expected: ```sql spark-sql> SHOW PARTITIONS tbl1 PARTITION(YEAR = 2015, Month = 1); year=2015/month=1 ``` ### How was this patch tested? By running the affected test suites: - `v1/ShowPartitionsSuite` - `v2/ShowPartitionsSuite` Closes #30615 from MaxGekk/show-partitions-case-sensitivity-test. 
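For illustration only, not code from this patch: a minimal, self-contained sketch of the normalization step this change relies on. In Spark the work is done by `PartitioningUtils.normalizePartitionSpec` with the session's resolver; the names below are made up for the example.

```scala
object NormalizeSpecSketch {
  type Resolver = (String, String) => Boolean

  val caseInsensitiveResolver: Resolver = (a, b) => a.equalsIgnoreCase(b)
  val caseSensitiveResolver: Resolver = (a, b) => a == b

  // Map each user-supplied key onto the column name defined in the table's partition schema.
  def normalize(
      spec: Map[String, String],
      partitionColumns: Seq[String],
      resolver: Resolver): Map[String, String] = {
    spec.map { case (key, value) =>
      val normalizedKey = partitionColumns
        .find(col => resolver(col, key))
        .getOrElse(throw new IllegalArgumentException(
          s"$key is not a valid partition column in [${partitionColumns.mkString(", ")}]"))
      normalizedKey -> value
    }
  }

  def main(args: Array[String]): Unit = {
    val cols = Seq("year", "month")
    // spark.sql.caseSensitive=false: YEAR/Month resolve to year/month.
    println(normalize(Map("YEAR" -> "2015", "Month" -> "1"), cols, caseInsensitiveResolver))
    // spark.sql.caseSensitive=true: the same spec is rejected as an invalid partition column.
  }
}
```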
Authored-by: Max Gekk Signed-off-by: Dongjoon Hyun --- .../spark/sql/execution/command/tables.scala | 18 ++++++------ .../command/ShowPartitionsSuiteBase.scala | 28 +++++++++++++++++-- .../command/v1/ShowPartitionsSuite.scala | 4 --- .../command/v2/ShowPartitionsSuite.scala | 4 --- 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 9e3ca3c321a54..59adb7dd7e319 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -1006,20 +1006,18 @@ case class ShowPartitionsCommand( DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "SHOW PARTITIONS") /** - * Validate the partitioning spec by making sure all the referenced columns are + * Normalizes the partition spec w.r.t the partition columns and case sensitivity settings, + * and validates the spec by making sure all the referenced columns are * defined as partitioning columns in table definition. An AnalysisException exception is * thrown if the partitioning spec is invalid. */ - if (spec.isDefined) { - val badColumns = spec.get.keySet.filterNot(table.partitionColumnNames.contains) - if (badColumns.nonEmpty) { - val badCols = badColumns.mkString("[", ", ", "]") - throw new AnalysisException( - s"Non-partitioning column(s) $badCols are specified for SHOW PARTITIONS") - } - } + val normalizedSpec = spec.map(partitionSpec => PartitioningUtils.normalizePartitionSpec( + partitionSpec, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver)) - val partNames = catalog.listPartitionNames(tableName, spec) + val partNames = catalog.listPartitionNames(tableName, normalizedSpec) partNames.map(Row(_)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala index 82457f96a3003..b695decdb3ec9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala @@ -21,6 +21,7 @@ import org.scalactic.source.Position import org.scalatest.Tag import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.{StringType, StructType} @@ -28,7 +29,6 @@ trait ShowPartitionsSuiteBase extends QueryTest with SQLTestUtils { protected def version: String protected def catalog: String protected def defaultUsing: String - protected def wrongPartitionColumnsError(columns: String*): String // Gets the schema of `SHOW PARTITIONS` private val showSchema: StructType = new StructType().add("partition", StringType, false) protected def runShowPartitionsSql(sqlText: String, expected: Seq[Row]): Unit = { @@ -94,7 +94,7 @@ trait ShowPartitionsSuiteBase extends QueryTest with SQLTestUtils { val errMsg = intercept[AnalysisException] { sql(s"SHOW PARTITIONS $table PARTITION(abcd=2015, xyz=1)") }.getMessage - assert(errMsg.contains(wrongPartitionColumnsError("abcd", "xyz"))) + assert(errMsg.contains("abcd is not a valid partition column")) } } } @@ -149,4 +149,28 @@ trait ShowPartitionsSuiteBase extends QueryTest with SQLTestUtils { } } } + + 
test("SPARK-33667: case sensitivity of partition spec") { + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + val t = s"$catalog.ns.part_table" + withTable(t) { + sql(s""" + |CREATE TABLE $t (price int, qty int, year int, month int) + |$defaultUsing + |PARTITIONED BY (year, month)""".stripMargin) + sql(s"INSERT INTO $t PARTITION(year = 2015, month = 1) SELECT 1, 1") + Seq( + true -> "PARTITION(year = 2015, month = 1)", + false -> "PARTITION(YEAR = 2015, Month = 1)" + ).foreach { case (caseSensitive, partitionSpec) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + runShowPartitionsSql( + s"SHOW PARTITIONS $t $partitionSpec", + Row("year=2015/month=1") :: Nil) + } + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala index 2b2bc9e63dc82..c752a5f358bb9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala @@ -27,10 +27,6 @@ trait ShowPartitionsSuiteBase extends command.ShowPartitionsSuiteBase { override def catalog: String = CatalogManager.SESSION_CATALOG_NAME override def defaultUsing: String = "USING parquet" - override protected def wrongPartitionColumnsError(columns: String*): String = { - s"Non-partitioning column(s) ${columns.mkString("[", ", ", "]")} are specified" - } - test("show everything in the default database") { val table = "dateTable" withTable(table) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala index ca47a713ad604..55985a335c94b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala @@ -32,10 +32,6 @@ class ShowPartitionsSuite extends command.ShowPartitionsSuiteBase with SharedSpa .set(s"spark.sql.catalog.$catalog", classOf[InMemoryPartitionTableCatalog].getName) .set(s"spark.sql.catalog.non_part_$catalog", classOf[InMemoryTableCatalog].getName) - override protected def wrongPartitionColumnsError(columns: String*): String = { - s"${columns.head} is not a valid partition column" - } - test("a table does not support partitioning") { val table = s"non_part_$catalog.tab1" withTable(table) { From b94ecf0734b829878956d98b74323e0c80822fec Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Sun, 6 Dec 2020 22:36:34 +0800 Subject: [PATCH 119/150] [SPARK-33674][TEST] Show Slowpoke notifications in SBT tests ### What changes were proposed in this pull request? This PR is to show Slowpoke notifications in the log when running tests using SBT. For example, the test case "zero sized blocks" in ExternalShuffleServiceSuite enters the infinite loop. After this change, the log file will have a notification message every 5 minute when the test case running longer than two minutes. Below is an example message. 
``` [info] ExternalShuffleServiceSuite: [info] - groupByKey without compression (101 milliseconds) [info] - shuffle non-zero block size (3 seconds, 186 milliseconds) [info] - shuffle serializer (3 seconds, 189 milliseconds) [info] *** Test still running after 2 minute, 1 seconds: suite name: ExternalShuffleServiceSuite, test name: zero sized blocks. [info] *** Test still running after 7 minute, 1 seconds: suite name: ExternalShuffleServiceSuite, test name: zero sized blocks. [info] *** Test still running after 12 minutes, 1 seconds: suite name: ExternalShuffleServiceSuite, test name: zero sized blocks. [info] *** Test still running after 17 minutes, 1 seconds: suite name: ExternalShuffleServiceSuite, test name: zero sized blocks. ``` ### Why are the changes needed? When the tests/code has bug and enters the infinite loop, it is hard to tell which test cases hit some issues from the log, especially when we are running the tests in parallel. It would be nice to show the Slowpoke notifications. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manual testing in my local dev environment. Closes #30621 from gatorsmile/addSlowpoke. Authored-by: Xiao Li Signed-off-by: Yuming Wang --- project/SparkBuild.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index a5951e0452943..23fb73d228e01 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -1055,6 +1055,9 @@ object TestSettings { }.getOrElse(Nil): _*), // Show full stack trace and duration in test cases. testOptions in Test += Tests.Argument("-oDF"), + // Slowpoke notifications: receive notifications every 5 minute of tests that have been running + // longer than two minutes. + testOptions in Test += Tests.Argument(TestFrameworks.ScalaTest, "-W", "120", "300"), testOptions in Test += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), // Enable Junit testing. libraryDependencies += "com.novocode" % "junit-interface" % "0.11" % "test", From 119539fd493af5ed0e37af79320787f145eaf3f1 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Mon, 7 Dec 2020 09:48:16 +0900 Subject: [PATCH 120/150] [SPARK-33663][SQL] Uncaching should not be called on non-existing temp views ### What changes were proposed in this pull request? 
This PR proposes to fix a misleading logs in the following scenario when uncaching is called on non-existing views: ``` scala> sql("CREATE TABLE table USING parquet AS SELECT 2") res0: org.apache.spark.sql.DataFrame = [] scala> val df = spark.table("table") df: org.apache.spark.sql.DataFrame = [2: int] scala> df.createOrReplaceTempView("t2") 20/12/04 10:16:24 WARN CommandUtils: Exception when attempting to uncache $name org.apache.spark.sql.AnalysisException: Table or view not found: t2;; 'UnresolvedRelation [t2], [], false at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1(CheckAnalysis.scala:113) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1$adapted(CheckAnalysis.scala:93) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:183) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis(CheckAnalysis.scala:93) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis$(CheckAnalysis.scala:90) at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:152) at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:172) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:214) at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:169) at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:73) at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:138) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:768) at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:138) at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:73) at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:71) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:63) at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:90) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:768) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:88) at org.apache.spark.sql.DataFrameReader.table(DataFrameReader.scala:889) at org.apache.spark.sql.SparkSession.table(SparkSession.scala:589) at org.apache.spark.sql.internal.CatalogImpl.uncacheTable(CatalogImpl.scala:476) at org.apache.spark.sql.execution.command.CommandUtils$.uncacheTableOrView(CommandUtils.scala:392) at org.apache.spark.sql.execution.command.CreateViewCommand.run(views.scala:124) ``` Since `t2` does not exist yet, it shouldn't try to uncache. ### Why are the changes needed? To fix misleading message. ### Does this PR introduce _any_ user-facing change? Yes, the above message will not be displayed if the view doesn't exist yet. ### How was this patch tested? Manually tested since this is a log message printed. Closes #30608 from imback82/fix_cache_message. 
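Illustrative sketch only, with hypothetical names rather than the patched `views.scala`: the guard now checks for an existing temp view whose plan differs, or cannot be compared because it is unresolved, before attempting to uncache, so creating a brand-new view no longer triggers a spurious uncache and warning.

```scala
object UncacheGuardSketch {
  // Stand-in for a registered temp view; `resolved = false` models an unresolved View
  // plan whose result cannot be compared reliably.
  final case class TempView(plan: String, resolved: Boolean)

  def shouldUncache(replace: Boolean, existing: Option[TempView], newPlan: String): Boolean =
    replace && existing.exists { v =>
      if (!v.resolved) true  // unresolved plan: uncache to be safe
      else v.plan != newPlan // resolved plan: uncache only if the result would change
    }

  def main(args: Array[String]): Unit = {
    // The scenario above: "t2" does not exist yet, so nothing is uncached and nothing is logged.
    assert(!shouldUncache(replace = true, existing = None, newPlan = "SELECT id FROM table"))
    // Replacing an existing view with a different plan still uncaches it first.
    assert(shouldUncache(replace = true,
      existing = Some(TempView("SELECT 1", resolved = true)), newPlan = "SELECT 2"))
  }
}
```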
Authored-by: Terry Kim Signed-off-by: HyukjinKwon --- .../spark/sql/execution/command/views.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 4ad5eddb83f43..06b1e03adea50 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -113,12 +113,12 @@ case class CreateViewCommand( verifyTemporaryObjectsNotExists(catalog, isTemporary, name, child) if (viewType == LocalTempView) { - val samePlan = catalog.getTempView(name.table).exists { - // Don't perform sameResult check for View logical plan, since it's unresolved - case _: View => false - case other => other.sameResult(child) + val shouldUncache = replace && catalog.getTempView(name.table).exists { + // Uncache View logical plan without checking the same result check, since it's unresolved. + case _: View => true + case other => !other.sameResult(child) } - if (replace && !samePlan) { + if (shouldUncache) { logInfo(s"Try to uncache ${name.quotedString} before replacing.") checkCyclicViewReference(analyzedPlan, Seq(name), name) CommandUtils.uncacheTableOrView(sparkSession, name.quotedString) @@ -141,12 +141,12 @@ case class CreateViewCommand( } else if (viewType == GlobalTempView) { val db = sparkSession.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) val viewIdent = TableIdentifier(name.table, Option(db)) - val samePlan = catalog.getGlobalTempView(name.table).exists { - // Don't perform sameResult check for View logical plan, since it's unresolved - case _: View => false - case other => other.sameResult(child) + val shouldUncache = replace && catalog.getGlobalTempView(name.table).exists { + // Uncache View logical plan without checking the same result check, since it's unresolved. + case _: View => true + case other => !other.sameResult(child) } - if (replace && !samePlan) { + if (shouldUncache) { logInfo(s"Try to uncache ${viewIdent.quotedString} before replacing.") checkCyclicViewReference(analyzedPlan, Seq(viewIdent), viewIdent) CommandUtils.uncacheTableOrView(sparkSession, viewIdent.quotedString) From e32de29bcee6073a2d2b9bb4e5930459eaf460d9 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 7 Dec 2020 10:05:28 +0900 Subject: [PATCH 121/150] [SPARK-33675][INFRA] Add GitHub Action job to publish snapshot ### What changes were proposed in this pull request? This PR aims to add `GitHub Action` job to publish daily snapshot for **master** branch. - https://repository.apache.org/content/groups/snapshots/org/apache/spark/spark-core_2.12/3.2.0-SNAPSHOT/ For the other branches, I'll make adjusted backports. - For `branch-3.1`, we can specify the checkout `ref` to `branch-3.1`. - For `branch-2.4` and `branch-3.0`, we can publish at every commit since the traffic is low. - https://github.com/apache/spark/pull/30630 (branch-3.0) - https://github.com/apache/spark/pull/30629 (branch-2.4 LTS) ### Why are the changes needed? After this series of jobs, this will reduce our maintenance burden permanently from AmpLab Jenkins by removing the following completely. https://amplab.cs.berkeley.edu/jenkins/view/Spark%20Packaging/ For now, AmpLab Jenkins doesn't have a job for `branch-3.1`. We can do it by ourselves by `GitHub Action`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? 
The snapshot publishing is tested here at PR trigger. Since this PR adds a scheduled job, we cannot test in this PR. - https://github.com/dongjoon-hyun/spark/runs/1505792859 Apache Infra team finished the setup here. - https://issues.apache.org/jira/browse/INFRA-21167 Closes #30623 from dongjoon-hyun/SPARK-33675. Authored-by: Dongjoon Hyun Signed-off-by: HyukjinKwon --- .github/workflows/publish_snapshot.yml | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/publish_snapshot.yml diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml new file mode 100644 index 0000000000000..9871680f73891 --- /dev/null +++ b/.github/workflows/publish_snapshot.yml @@ -0,0 +1,30 @@ +name: Publish Snapshot + +on: + schedule: + - cron: '0 0 * * *' + +jobs: + publish-snapshot: + runs-on: ubuntu-latest + steps: + - name: Checkout Spark repository + uses: actions/checkout@master + - name: Cache Maven local repository + uses: actions/cache@v2 + with: + path: ~/.m2/repository + key: snapshot-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + snapshot-maven- + - name: Install Java 8 + uses: actions/setup-java@v1 + with: + java-version: 8 + - name: Publish snapshot + env: + ASF_USERNAME: ${{ secrets.NEXUS_USER }} + ASF_PASSWORD: ${{ secrets.NEXUS_PW }} + GPG_KEY: "not_used" + GPG_PASSPHRASE: "not_used" + run: ./dev/create-release/release-build.sh publish-snapshot From 29096a8869c95221dc75ce7fd3d098680bef4f55 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 7 Dec 2020 10:21:04 +0900 Subject: [PATCH 122/150] [SPARK-33670][SQL] Verify the partition provider is Hive in v1 SHOW TABLE EXTENDED ### What changes were proposed in this pull request? Invoke the check `DDLUtils.verifyPartitionProviderIsHive()` from V1 implementation of `SHOW TABLE EXTENDED` when partition specs are specified. This PR is some kind of follow up https://github.com/apache/spark/pull/16373 and https://github.com/apache/spark/pull/15515. ### Why are the changes needed? To output an user friendly error with recommendation like **" ... partition metadata is not stored in the Hive metastore. To import this information into the metastore, run `msck repair table tableName` "** instead of silently output an empty result. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? By running the affected test suites, in particular: ``` $ build/sbt -Phive-2.3 -Phive-thriftserver "hive/test:testOnly *PartitionProviderCompatibilitySuite" ``` Closes #30618 from MaxGekk/show-table-extended-verifyPartitionProviderIsHive. Authored-by: Max Gekk Signed-off-by: HyukjinKwon --- .../spark/sql/execution/command/tables.scala | 3 +++ .../execution/command/v1/ShowTablesSuite.scala | 18 ++++++++++++++++-- .../PartitionProviderCompatibilitySuite.scala | 14 ++++++++++---- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 59adb7dd7e319..54660ced8d834 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -879,6 +879,9 @@ case class ShowTablesCommand( // Note: tableIdentifierPattern should be non-empty, otherwise a [[ParseException]] // should have been thrown by the sql parser. 
val table = catalog.getTableMetadata(TableIdentifier(tableIdentifierPattern.get, Some(db))) + + DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "SHOW TABLE EXTENDED") + val tableIdent = table.identifier val normalizedSpec = PartitioningUtils.normalizePartitionSpec( partitionSpec.get, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala index 8f29f9f276138..3db880c776365 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowTablesSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.command.v1 -import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.{AnalysisException, Row, SaveMode} import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution.command import org.apache.spark.sql.internal.SQLConf @@ -111,4 +111,18 @@ trait ShowTablesSuiteBase extends command.ShowTablesSuiteBase { } } -class ShowTablesSuite extends ShowTablesSuiteBase with SharedSparkSession +class ShowTablesSuite extends ShowTablesSuiteBase with SharedSparkSession { + test("SPARK-33670: show partitions from a datasource table") { + import testImplicits._ + withNamespace(s"$catalog.ns") { + sql(s"CREATE NAMESPACE $catalog.ns") + sql(s"USE $catalog.ns") + val t = "part_datasrc" + withTable(t) { + val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c") + df.write.partitionBy("a").format("parquet").mode(SaveMode.Overwrite).saveAsTable(t) + assert(sql(s"SHOW TABLE EXTENDED LIKE '$t' PARTITION(a = 1)").count() === 1) + } + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala index 80afc9d8f44bc..e1b0637963b75 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -53,7 +53,8 @@ class PartitionProviderCompatibilitySuite s"ALTER TABLE $tableName PARTITION (partCol=1) SET LOCATION '/foo'", s"ALTER TABLE $tableName DROP PARTITION (partCol=1)", s"DESCRIBE $tableName PARTITION (partCol=1)", - s"SHOW PARTITIONS $tableName") + s"SHOW PARTITIONS $tableName", + s"SHOW TABLE EXTENDED LIKE '$tableName' PARTITION (partCol=1)") withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { for (cmd <- unsupportedCommands) { @@ -124,10 +125,15 @@ class PartitionProviderCompatibilitySuite } // disabled withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { - val e = intercept[AnalysisException] { - spark.sql(s"show partitions test") + Seq( + "SHOW PARTITIONS test", + "SHOW TABLE EXTENDED LIKE 'test' PARTITION (partCol=1)" + ).foreach { showPartitions => + val e = intercept[AnalysisException] { + spark.sql(showPartitions) + } + assert(e.getMessage.contains("filesource partition management is disabled")) } - assert(e.getMessage.contains("filesource partition management is disabled")) spark.sql("refresh table test") assert(spark.sql("select * from test").count() == 5) } From e88f0d4a2436cc47c8bf8ed2a739eab728ea3d81 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 6 Dec 2020 17:57:19 -0800 Subject: [PATCH 123/150] [SPARK-33683][INFRA] Remove -Djava.version=11 
from Scala 2.13 build in GitHub Actions ### What changes were proposed in this pull request? This PR removes `-Djava.version=11` from the build command for Scala 2.13 in the GitHub Actions' job. In the GitHub Actions' job, the build command for Scala 2.13 is defined as follows. ``` ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile ``` Though, Scala 2.13 build uses Java 8 rather than 11 so let's remove `-Djava.version=11`. ### Why are the changes needed? To build with consistent configuration. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Should be done by GitHub Actions' workflow. Closes #30633 from sarutak/scala-213-java11. Authored-by: Kousuke Saruta Signed-off-by: Dongjoon Hyun --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3bb083387f3e..72b2caf907151 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -414,7 +414,7 @@ jobs: - name: Build with SBT run: | ./dev/change-scala-version.sh 2.13 - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile + ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pscala-2.13 compile test:compile hadoop-2: name: Hadoop 2 build with SBT From 73412ffb3a857acda5dab41d7be3f7ae627f6eaf Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 6 Dec 2020 19:34:54 -0800 Subject: [PATCH 124/150] [SPARK-33680][SQL][TESTS] Fix PrunePartitionSuiteBase/BucketedReadWithHiveSupportSuite not to depend on the default conf ### What changes were proposed in this pull request? This PR updates `PrunePartitionSuiteBase/BucketedReadWithHiveSupportSuite` to have the require conf explicitly. ### Why are the changes needed? The unit test should not depend on the default configurations. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? According to https://github.com/apache/spark/pull/30628 , this seems to be the only ones. Pass the CIs. Closes #30631 from dongjoon-hyun/SPARK-CONF-AGNO. 
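For context, a simplified, self-contained sketch of the pattern applied here, not Spark's actual `SQLHelper`: pin every configuration a test relies on for the duration of the test body and restore the previous value afterwards, instead of assuming the suite-wide default.

```scala
object WithConfSketch {
  private val conf = scala.collection.mutable.Map[String, String]()

  // Set the given entries for the duration of `body`, then restore the previous values.
  def withConf[T](pairs: (String, String)*)(body: => T): T = {
    val saved = pairs.map { case (k, _) => k -> conf.get(k) }
    pairs.foreach { case (k, v) => conf(k) = v }
    try body
    finally saved.foreach {
      case (k, Some(old)) => conf(k) = old
      case (k, None) => conf.remove(k)
    }
  }

  def main(args: Array[String]): Unit = {
    conf("spark.sql.adaptive.enabled") = "true" // whatever the suite default happens to be
    withConf("spark.sql.adaptive.enabled" -> "false") {
      assert(conf("spark.sql.adaptive.enabled") == "false") // pinned for the test body
    }
    assert(conf("spark.sql.adaptive.enabled") == "true") // restored afterwards
  }
}
```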
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../execution/PrunePartitionSuiteBase.scala | 81 ++++++++++--------- .../BucketedReadWithHiveSupportSuite.scala | 4 +- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala index 8e35cd034311d..bc170fcd59026 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BinaryOperator, Expression, IsNotNull, Literal} import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan} import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf.ADAPTIVE_EXECUTION_ENABLED import org.apache.spark.sql.test.SQLTestUtils abstract class PrunePartitionSuiteBase extends QueryTest with SQLTestUtils with TestHiveSingleton { @@ -28,48 +29,50 @@ abstract class PrunePartitionSuiteBase extends QueryTest with SQLTestUtils with protected def format: String test("SPARK-28169: Convert scan predicate condition to CNF") { - withTempView("temp") { - withTable("t") { - sql( - s""" - |CREATE TABLE t(i INT, p STRING) - |USING $format - |PARTITIONED BY (p)""".stripMargin) - - spark.range(0, 1000, 1).selectExpr("id as col") - .createOrReplaceTempView("temp") - - for (part <- Seq(1, 2, 3, 4)) { + withSQLConf(ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + withTempView("temp") { + withTable("t") { sql( s""" - |INSERT OVERWRITE TABLE t PARTITION (p='$part') - |SELECT col FROM temp""".stripMargin) - } + |CREATE TABLE t(i INT, p STRING) + |USING $format + |PARTITIONED BY (p)""".stripMargin) - assertPrunedPartitions( - "SELECT * FROM t WHERE p = '1' OR (p = '2' AND i = 1)", 2, - "((`p` = '1') || (`p` = '2'))") - assertPrunedPartitions( - "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (i = 1 OR p = '2')", 4, - "") - assertPrunedPartitions( - "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (p = '3' AND i = 3 )", 2, - "((`p` = '1') || (`p` = '3'))") - assertPrunedPartitions( - "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (p = '2' OR p = '3')", 3, - "((`p` = '1') || ((`p` = '2') || (`p` = '3')))") - assertPrunedPartitions( - "SELECT * FROM t", 4, - "") - assertPrunedPartitions( - "SELECT * FROM t WHERE p = '1' AND i = 2", 1, - "(`p` = '1')") - assertPrunedPartitions( - """ - |SELECT i, COUNT(1) FROM ( - |SELECT * FROM t WHERE p = '1' OR (p = '2' AND i = 1) - |) tmp GROUP BY i - """.stripMargin, 2, "((`p` = '1') || (`p` = '2'))") + spark.range(0, 1000, 1).selectExpr("id as col") + .createOrReplaceTempView("temp") + + for (part <- Seq(1, 2, 3, 4)) { + sql( + s""" + |INSERT OVERWRITE TABLE t PARTITION (p='$part') + |SELECT col FROM temp""".stripMargin) + } + + assertPrunedPartitions( + "SELECT * FROM t WHERE p = '1' OR (p = '2' AND i = 1)", 2, + "((`p` = '1') || (`p` = '2'))") + assertPrunedPartitions( + "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (i = 1 OR p = '2')", 4, + "") + assertPrunedPartitions( + "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (p = '3' AND i = 3 )", 2, + "((`p` = '1') || (`p` = '3'))") + assertPrunedPartitions( + "SELECT * FROM t WHERE (p = '1' AND i = 2) OR (p = '2' OR p = '3')", 3, + "((`p` = '1') || ((`p` = '2') || (`p` = '3')))") + assertPrunedPartitions( + 
"SELECT * FROM t", 4, + "") + assertPrunedPartitions( + "SELECT * FROM t WHERE p = '1' AND i = 2", 1, + "(`p` = '1')") + assertPrunedPartitions( + """ + |SELECT i, COUNT(1) FROM ( + |SELECT * FROM t WHERE p = '1' OR (p = '2' AND i = 1) + |) tmp GROUP BY i + """.stripMargin, 2, "((`p` = '1') || (`p` = '2'))") + } } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala index 35dab79ff6dff..07901351fc0fc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadWithHiveSupportSuite.scala @@ -17,10 +17,12 @@ package org.apache.spark.sql.sources +import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION -class BucketedReadWithHiveSupportSuite extends BucketedReadSuite with TestHiveSingleton { +class BucketedReadWithHiveSupportSuite + extends BucketedReadSuite with DisableAdaptiveExecutionSuite with TestHiveSingleton { protected override def beforeAll(): Unit = { super.beforeAll() assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive") From d48ef34911b8928b66df92399119caebb24616d4 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 6 Dec 2020 23:02:36 -0800 Subject: [PATCH 125/150] [SPARK-33684][BUILD] Upgrade httpclient from 4.5.6 to 4.5.13 ### What changes were proposed in this pull request? This PR upgrades `commons.httpclient` from `4.5.6` to `4.5.13`. 4.5.6 is released over 2 years ago and now we can use more stable `4.5.13`. https://archive.apache.org/dist/httpcomponents/httpclient/RELEASE_NOTES-4.5.x.txt ### Why are the changes needed? To follow the more stable release. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Should be done by the existing tests. Closes #30634 from sarutak/upgrade-httpclient. 
Authored-by: Kousuke Saruta Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 2 +- dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 2 +- pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index a19558bc2a5e3..401050a60e493 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -99,7 +99,7 @@ hk2-api/2.6.1//hk2-api-2.6.1.jar hk2-locator/2.6.1//hk2-locator-2.6.1.jar hk2-utils/2.6.1//hk2-utils-2.6.1.jar htrace-core/3.1.0-incubating//htrace-core-3.1.0-incubating.jar -httpclient/4.5.6//httpclient-4.5.6.jar +httpclient/4.5.13//httpclient-4.5.13.jar httpcore/4.4.12//httpcore-4.4.12.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.4.0//ivy-2.4.0.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index 24283224dd37d..b0f8935843281 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -98,7 +98,7 @@ hk2-api/2.6.1//hk2-api-2.6.1.jar hk2-locator/2.6.1//hk2-locator-2.6.1.jar hk2-utils/2.6.1//hk2-utils-2.6.1.jar htrace-core4/4.1.0-incubating//htrace-core4-4.1.0-incubating.jar -httpclient/4.5.6//httpclient-4.5.6.jar +httpclient/4.5.13//httpclient-4.5.13.jar httpcore/4.4.12//httpcore-4.4.12.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.4.0//ivy-2.4.0.jar diff --git a/pom.xml b/pom.xml index 1d7704055898b..364dec688b38b 100644 --- a/pom.xml +++ b/pom.xml @@ -155,7 +155,7 @@ 0.12.8 - 4.5.6 + 4.5.13 4.4.12 3.1 From 87c056088e853d475f1507e296ad06480862e8a7 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 6 Dec 2020 23:22:52 -0800 Subject: [PATCH 126/150] [SPARK-33671][SQL] Remove VIEW checks from V1 table commands ### What changes were proposed in this pull request? Remove VIEW checks from the following V1 commands: - `SHOW PARTITIONS` - `TRUNCATE TABLE` - `LOAD DATA` The checks are performed earlier at: https://github.com/apache/spark/blob/acc211d2cf0e6ab94f6578e1eb488766fd20fa4e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala#L885-L889 ### Why are the changes needed? To improve code maintenance, and remove dead codes. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By existing test suites like `v1/ShowPartitionsSuite`. 1. LOAD DATA: https://github.com/apache/spark/blob/acc211d2cf0e6ab94f6578e1eb488766fd20fa4e/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala#L176-L179 2. TRUNCATE TABLE: https://github.com/apache/spark/blob/acc211d2cf0e6ab94f6578e1eb488766fd20fa4e/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala#L180-L183 3. SHOW PARTITIONS: - v1/ShowPartitionsSuite Closes #30620 from MaxGekk/show-table-check-view. 
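A generic sketch of why the removed checks were dead code, using hypothetical types rather than Spark's analyzer: once name resolution only ever hands a table, never a view, to these commands, a second VIEW check inside the command body can never fire.

```scala
object ResolveTableSketch {
  sealed trait Relation { def name: String }
  final case class ResolvedTable(name: String) extends Relation
  final case class ResolvedView(name: String) extends Relation

  // Resolution-time check: views are rejected before any command logic runs.
  def lookupTableNotView(r: Relation): ResolvedTable = r match {
    case t: ResolvedTable => t
    case v: ResolvedView =>
      throw new IllegalArgumentException(s"${v.name} is a view; expected a table")
  }

  // The command body can therefore assume a table and needs no VIEW check of its own.
  def truncateTable(t: ResolvedTable): Unit = println(s"TRUNCATE TABLE ${t.name}")

  def main(args: Array[String]): Unit = {
    truncateTable(lookupTableNotView(ResolvedTable("tbl")))
  }
}
```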
Authored-by: Max Gekk Signed-off-by: Dongjoon Hyun --- .../apache/spark/sql/execution/command/tables.scala | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 54660ced8d834..640051384e94c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -311,9 +311,6 @@ case class LoadDataCommand( sparkSession.sessionState.conf.resolver) } - if (targetTable.tableType == CatalogTableType.VIEW) { - throw new AnalysisException(s"Target table in LOAD DATA cannot be a view: $tableIdentwithDB") - } if (DDLUtils.isDatasourceTable(targetTable)) { throw new AnalysisException( s"LOAD DATA is not supported for datasource tables: $tableIdentwithDB") @@ -452,10 +449,6 @@ case class TruncateTableCommand( throw new AnalysisException( s"Operation not allowed: TRUNCATE TABLE on external tables: $tableIdentWithDB") } - if (table.tableType == CatalogTableType.VIEW) { - throw new AnalysisException( - s"Operation not allowed: TRUNCATE TABLE on views: $tableIdentWithDB") - } if (table.partitionColumnNames.isEmpty && partitionSpec.isDefined) { throw new AnalysisException( s"Operation not allowed: TRUNCATE TABLE ... PARTITION is not supported " + @@ -995,11 +988,7 @@ case class ShowPartitionsCommand( * Validate and throws an [[AnalysisException]] exception under the following conditions: * 1. If the table is not partitioned. * 2. If it is a datasource table. - * 3. If it is a view. */ - if (table.tableType == VIEW) { - throw new AnalysisException(s"SHOW PARTITIONS is not allowed on a view: $tableIdentWithDB") - } if (table.partitionColumnNames.isEmpty) { throw new AnalysisException( From 26c0493318c2a3e5b74ff3829de88605aff8e832 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 7 Dec 2020 08:14:36 +0000 Subject: [PATCH 127/150] [SPARK-33676][SQL] Require exact matching of partition spec to the schema in V2 `ALTER TABLE .. ADD/DROP PARTITION` ### What changes were proposed in this pull request? Check that partitions specs passed to v2 `ALTER TABLE .. ADD/DROP PARTITION` exactly match to the partition schema (all partition fields from the schema are specified in partition specs). ### Why are the changes needed? 1. To have the same behavior as V1 `ALTER TABLE .. ADD/DROP PARTITION` that output the error: ```sql spark-sql> create table tab1 (id int, a int, b int) using parquet partitioned by (a, b); spark-sql> ALTER TABLE tab1 ADD PARTITION (A='9'); Error in query: Partition spec is invalid. The spec (a) must match the partition spec (a, b) defined in table '`default`.`tab1`'; ``` 2. To prevent future errors caused by not fully specified partition specs. ### Does this PR introduce _any_ user-facing change? Yes. The V2 implementation of `ALTER TABLE .. ADD/DROP PARTITION` output the same error as V1 commands. ### How was this patch tested? By running the test suite with new UT: ``` $ build/sbt "test:testOnly *AlterTablePartitionV2SQLSuite" ``` Closes #30624 from MaxGekk/add-partition-full-spec. 
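For illustration, a standalone sketch mirroring the check added in `PartitioningUtils.requireExactMatchedPartitionSpec`, with simplified error handling: a spec is valid only if it names exactly the table's partition columns, in any order.

```scala
object ExactSpecSketch {
  def requireExactMatch(
      tableName: String,
      spec: Map[String, String],
      partitionColumns: Seq[String]): Unit = {
    // The columns must be the same, but their order may differ.
    if (spec.keys.toSeq.sorted != partitionColumns.sorted) {
      throw new IllegalArgumentException(
        s"Partition spec is invalid. The spec (${spec.keys.mkString(", ")}) must match " +
          s"the partition spec (${partitionColumns.mkString(", ")}) defined in table '$tableName'")
    }
  }

  def main(args: Array[String]): Unit = {
    val cols = Seq("part0", "part1")
    requireExactMatch("ns1.ns2.tbl", Map("part1" -> "a", "part0" -> "1"), cols) // passes
    // requireExactMatch("ns1.ns2.tbl", Map("part0" -> "1"), cols)              // would throw
  }
}
```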
Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../analysis/ResolvePartitionSpec.scala | 20 +++++++++++++++---- .../sql/catalyst/catalog/SessionCatalog.scala | 15 ++++++-------- .../spark/sql/util/PartitioningUtils.scala | 18 +++++++++++++++++ .../AlterTablePartitionV2SQLSuite.scala | 20 +++++++++++++++++++ 4 files changed, 60 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala index 38991a9e24fa8..feb05d3b6926b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolvePartitionSpec.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.catalog.SupportsPartitionManagement import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec +import org.apache.spark.sql.util.PartitioningUtils.{normalizePartitionSpec, requireExactMatchedPartitionSpec} /** * Resolve [[UnresolvedPartitionSpec]] to [[ResolvedPartitionSpec]] in partition related commands. @@ -35,11 +35,21 @@ object ResolvePartitionSpec extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case r @ AlterTableAddPartition( ResolvedTable(_, _, table: SupportsPartitionManagement), partSpecs, _) => - r.copy(parts = resolvePartitionSpecs(table.name, partSpecs, table.partitionSchema())) + val partitionSchema = table.partitionSchema() + r.copy(parts = resolvePartitionSpecs( + table.name, + partSpecs, + partitionSchema, + requireExactMatchedPartitionSpec(table.name, _, partitionSchema.fieldNames))) case r @ AlterTableDropPartition( ResolvedTable(_, _, table: SupportsPartitionManagement), partSpecs, _, _, _) => - r.copy(parts = resolvePartitionSpecs(table.name, partSpecs, table.partitionSchema())) + val partitionSchema = table.partitionSchema() + r.copy(parts = resolvePartitionSpecs( + table.name, + partSpecs, + partitionSchema, + requireExactMatchedPartitionSpec(table.name, _, partitionSchema.fieldNames))) case r @ ShowPartitions(ResolvedTable(_, _, table: SupportsPartitionManagement), partSpecs) => r.copy(pattern = resolvePartitionSpecs( @@ -51,7 +61,8 @@ object ResolvePartitionSpec extends Rule[LogicalPlan] { private def resolvePartitionSpecs( tableName: String, partSpecs: Seq[PartitionSpec], - partSchema: StructType): Seq[ResolvedPartitionSpec] = + partSchema: StructType, + checkSpec: TablePartitionSpec => Unit = _ => ()): Seq[ResolvedPartitionSpec] = partSpecs.map { case unresolvedPartSpec: UnresolvedPartitionSpec => val normalizedSpec = normalizePartitionSpec( @@ -59,6 +70,7 @@ object ResolvePartitionSpec extends Rule[LogicalPlan] { partSchema.map(_.name), tableName, conf.resolver) + checkSpec(normalizedSpec) val partitionNames = normalizedSpec.keySet val requestedFields = partSchema.filter(field => partitionNames.contains(field.name)) ResolvedPartitionSpec( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 0cdbc1a234c66..a2ab756382488 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -43,7 +43,7 @@ import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.util.{CaseInsensitiveStringMap, PartitioningUtils} import org.apache.spark.util.Utils object SessionCatalog { @@ -1167,14 +1167,11 @@ class SessionCatalog( private def requireExactMatchedPartitionSpec( specs: Seq[TablePartitionSpec], table: CatalogTable): Unit = { - val defined = table.partitionColumnNames.sorted - specs.foreach { s => - if (s.keys.toSeq.sorted != defined) { - throw new AnalysisException( - s"Partition spec is invalid. The spec (${s.keys.mkString(", ")}) must match " + - s"the partition spec (${table.partitionColumnNames.mkString(", ")}) defined in " + - s"table '${table.identifier}'") - } + specs.foreach { spec => + PartitioningUtils.requireExactMatchedPartitionSpec( + table.identifier.toString, + spec, + table.partitionColumnNames) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala index 586aa6c59164f..e473e1d1b7ff3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.util import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec object PartitioningUtils { /** @@ -44,4 +45,21 @@ object PartitioningUtils { normalizedPartSpec.toMap } + + /** + * Verify if the input partition spec exactly matches the existing defined partition spec + * The columns must be the same but the orders could be different. + */ + def requireExactMatchedPartitionSpec( + tableName: String, + spec: TablePartitionSpec, + partitionColumnNames: Seq[String]): Unit = { + val defined = partitionColumnNames.sorted + if (spec.keys.toSeq.sorted != defined) { + throw new AnalysisException( + s"Partition spec is invalid. The spec (${spec.keys.mkString(", ")}) must match " + + s"the partition spec (${partitionColumnNames.mkString(", ")}) defined in " + + s"table '$tableName'") + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala index 47b5e5e54edde..45d47c6d8681c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTablePartitionV2SQLSuite.scala @@ -261,4 +261,24 @@ class AlterTablePartitionV2SQLSuite extends DatasourceV2SQLBase { } } } + + test("SPARK-33676: not fully specified partition spec") { + val t = "testpart.ns1.ns2.tbl" + withTable(t) { + sql(s""" + |CREATE TABLE $t (id bigint, part0 int, part1 string) + |USING foo + |PARTITIONED BY (part0, part1)""".stripMargin) + Seq( + s"ALTER TABLE $t ADD PARTITION (part0 = 1)", + s"ALTER TABLE $t DROP PARTITION (part0 = 1)" + ).foreach { alterTable => + val errMsg = intercept[AnalysisException] { + sql(alterTable) + }.getMessage + assert(errMsg.contains("Partition spec is invalid. 
" + + "The spec (part0) must match the partition spec (part0, part1)")) + } + } + } } From 1e0c006748c031d5277ba3b906b0bbf68e6bc893 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 7 Dec 2020 21:36:52 +0900 Subject: [PATCH 128/150] [SPARK-33617][SQL] Add default parallelism configuration for Spark SQL queries ### What changes were proposed in this pull request? This pr add default parallelism configuration(`spark.sql.default.parallelism`) for Spark SQL and make it effective for `LocalTableScan`. ### Why are the changes needed? Avoid generating small files for INSERT INTO TABLE from VALUES, for example: ```sql CREATE TABLE t1(id int) USING parquet; INSERT INTO TABLE t1 VALUES (1), (2), (3), (4), (5), (6), (7), (8); ``` Before this pr: ``` -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00000-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00001-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00002-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00003-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00004-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00005-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00006-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 421 Dec 1 01:54 part-00007-4d5a3a89-2995-4328-b2ae-908febbbaf4a-c000.snappy.parquet -rw-r--r-- 1 root root 0 Dec 1 01:54 _SUCCESS ``` After this pr and set `spark.sql.files.minPartitionNum` to 1: ``` -rw-r--r-- 1 root root 452 Dec 1 01:59 part-00000-6de50c79-e305-4f8d-b6ae-39f46b2619c6-c000.snappy.parquet -rw-r--r-- 1 root root 0 Dec 1 01:59 _SUCCESS ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #30559 from wangyum/SPARK-33617. 
Lead-authored-by: Yuming Wang Co-authored-by: Yuming Wang Signed-off-by: HyukjinKwon --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++++++++++ .../main/scala/org/apache/spark/sql/SparkSession.scala | 6 ++++-- .../spark/sql/execution/LocalTableScanExec.scala | 4 +++- .../execution/adaptive/CoalesceShufflePartitions.scala | 2 +- .../spark/sql/execution/basicPhysicalOperators.scala | 3 ++- .../org/apache/spark/sql/execution/command/ddl.scala | 3 ++- .../sql/execution/datasources/FilePartition.scala | 1 + .../sql/execution/datasources/SchemaMergeUtils.scala | 3 ++- .../apache/spark/sql/execution/SparkPlanSuite.scala | 9 +++++++++ 9 files changed, 34 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 025478214e492..ea30832008b56 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -374,6 +374,14 @@ object SQLConf { .booleanConf .createWithDefault(true) + val DEFAULT_PARALLELISM = buildConf("spark.sql.default.parallelism") + .doc("The number of parallelism for Spark SQL, the default value is " + + "`spark.default.parallelism`.") + .version("3.2.0") + .intConf + .checkValue(_ > 0, "The value of spark.sql.default.parallelism must be positive.") + .createOptional + val SHUFFLE_PARTITIONS = buildConf("spark.sql.shuffle.partitions") .doc("The default number of partitions to use when shuffling data for joins or aggregations. " + "Note: For structured streaming, this configuration cannot be changed between query " + @@ -3160,6 +3168,8 @@ class SQLConf extends Serializable with Logging { def cacheVectorizedReaderEnabled: Boolean = getConf(CACHE_VECTORIZED_READER_ENABLED) + def defaultParallelism: Option[Int] = getConf(DEFAULT_PARALLELISM) + def defaultNumShufflePartitions: Int = getConf(SHUFFLE_PARTITIONS) def numShufflePartitions: Int = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index db5ad52977c71..3a9b06940b769 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -519,7 +519,8 @@ class SparkSession private( * @since 2.0.0 */ def range(start: Long, end: Long): Dataset[java.lang.Long] = { - range(start, end, step = 1, numPartitions = sparkContext.defaultParallelism) + range(start, end, step = 1, + numPartitions = sqlContext.conf.defaultParallelism.getOrElse(sparkContext.defaultParallelism)) } /** @@ -529,7 +530,8 @@ class SparkSession private( * @since 2.0.0 */ def range(start: Long, end: Long, step: Long): Dataset[java.lang.Long] = { - range(start, end, step, numPartitions = sparkContext.defaultParallelism) + range(start, end, step, + numPartitions = sqlContext.conf.defaultParallelism.getOrElse(sparkContext.defaultParallelism)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala index b452213cd6cc7..02a8f46824241 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala @@ -49,7 +49,9 @@ case class LocalTableScanExec( if (rows.isEmpty) { sqlContext.sparkContext.emptyRDD } else { - val 
numSlices = math.min(unsafeRows.length, sqlContext.sparkContext.defaultParallelism) + val numSlices = math.min( + unsafeRows.length, + conf.defaultParallelism.getOrElse(sqlContext.sparkContext.defaultParallelism)) sqlContext.sparkContext.parallelize(unsafeRows, numSlices) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala index 0f482142227d2..6149bd214e540 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala @@ -67,7 +67,7 @@ case class CoalesceShufflePartitions(session: SparkSession) extends CustomShuffl // We fall back to Spark default parallelism if the minimum number of coalesced partitions // is not set, so to avoid perf regressions compared to no coalescing. val minPartitionNum = conf.getConf(SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM) - .getOrElse(session.sparkContext.defaultParallelism) + .orElse(conf.defaultParallelism).getOrElse(session.sparkContext.defaultParallelism) val partitionSpecs = ShufflePartitionsUtil.coalescePartitions( validMetrics.toArray, advisoryTargetSize = conf.getConf(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 006fa0fba4138..80a4090ce03f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -382,7 +382,8 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val start: Long = range.start val end: Long = range.end val step: Long = range.step - val numSlices: Int = range.numSlices.getOrElse(sparkContext.defaultParallelism) + val numSlices: Int = range.numSlices.orElse(sqlContext.conf.defaultParallelism) + .getOrElse(sparkContext.defaultParallelism) val numElements: BigInt = range.numElements val isEmptyRange: Boolean = start == end || (start < end ^ 0 < step) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 27ad62026c9b5..69425cfed285f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -738,7 +738,8 @@ case class AlterTableRecoverPartitionsCommand( // Set the number of parallelism to prevent following file listing from generating many tasks // in case of large #defaultParallelism. val numParallelism = Math.min(serializedPaths.length, - Math.min(spark.sparkContext.defaultParallelism, 10000)) + Math.min(spark.sessionState.conf.defaultParallelism + .getOrElse(spark.sparkContext.defaultParallelism), 10000)) // gather the fast stats for all the partitions otherwise Hive metastore will list all the // files for all the new partitions in sequential way, which is super slow. 
logInfo(s"Gather the fast stats in parallel using $numParallelism tasks.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala index 864130bbd87b7..1b35db8d0873c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala @@ -89,6 +89,7 @@ object FilePartition extends Logging { val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes val minPartitionNum = sparkSession.sessionState.conf.filesMinPartitionNum + .orElse(sparkSession.sessionState.conf.defaultParallelism) .getOrElse(sparkSession.sparkContext.defaultParallelism) val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum val bytesPerCore = totalBytes / minPartitionNum diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala index 28097c35401c9..54d79898bb81b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaMergeUtils.scala @@ -57,7 +57,8 @@ object SchemaMergeUtils extends Logging { // Set the number of partitions to prevent following schema reads from generating many tasks // in case of a small number of orc files. val numParallelism = Math.min(Math.max(partialFileStatusInfo.size, 1), - sparkSession.sparkContext.defaultParallelism) + sparkSession.sessionState.conf.defaultParallelism + .getOrElse(sparkSession.sparkContext.defaultParallelism)) val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala index 56fff1107ae39..254855247ced3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala @@ -88,4 +88,13 @@ class SparkPlanSuite extends QueryTest with SharedSparkSession { test("SPARK-30780 empty LocalTableScan should use RDD without partitions") { assert(LocalTableScanExec(Nil, Nil).execute().getNumPartitions == 0) } + + test("SPARK-33617: spark.sql.default.parallelism effective for LocalTableScan") { + Seq(1, 4).foreach { minPartitionNum => + withSQLConf(SQLConf.DEFAULT_PARALLELISM.key -> minPartitionNum.toString) { + val df = spark.sql("SELECT * FROM VALUES (1), (2), (3), (4), (5), (6), (7), (8)") + assert(df.rdd.partitions.length === minPartitionNum) + } + } + } } From d730b6bdaa92f2ca19cc8852ac58035e28d47a4f Mon Sep 17 00:00:00 2001 From: Linhong Liu Date: Mon, 7 Dec 2020 13:25:43 +0000 Subject: [PATCH 129/150] [SPARK-32680][SQL] Don't Preprocess V2 CTAS with Unresolved Query ### What changes were proposed in this pull request? The analyzer rule `PreprocessTableCreation` will preprocess table creation related logical plan. But for CTAS, if the sub-query can't be resolved, preprocess it will cause "Invalid call to toAttribute on unresolved object" (instead of a user-friendly error msg: "table or view not found"). This PR fixes this wrongly preprocess for CTAS using V2 catalog. 
### Why are the changes needed? bug fix ### Does this PR introduce _any_ user-facing change? The error message for CTAS with a non-exists table changed from: `UnresolvedException: Invalid call to toAttribute on unresolved object, tree: xxx` to `AnalysisException: Table or view not found: xxx` ### How was this patch tested? added test Closes #30637 from linhongliu-db/fix-ctas. Authored-by: Linhong Liu Signed-off-by: Wenchen Fan --- .../apache/spark/sql/execution/datasources/rules.scala | 2 +- .../test/scala/org/apache/spark/sql/DataFrameSuite.scala | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 2cc78258378ab..b9866e415c9b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -239,7 +239,7 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi c.copy(tableDesc = normalizedTable.copy(schema = reorderedSchema)) } - case create: V2CreateTablePlan => + case create: V2CreateTablePlan if create.childrenResolved => val schema = create.tableSchema val partitioning = create.partitioning val identifier = create.tableName diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d34dcb4fe0c01..a45bf12e8f841 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.expressions.Uuid import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, OneRowRelation} import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.FakeV2Provider import org.apache.spark.sql.execution.{FilterExec, QueryExecution, WholeStageCodegenExec} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.HashAggregateExec @@ -2451,6 +2452,14 @@ class DataFrameSuite extends QueryTest assert(e.getMessage.contains("Table or view not found:")) } + test("SPARK-32680: Don't analyze CTAS with unresolved query") { + val v2Source = classOf[FakeV2Provider].getName + val e = intercept[AnalysisException] { + sql(s"CREATE TABLE t USING $v2Source AS SELECT * from nonexist") + } + assert(e.getMessage.contains("Table or view not found:")) + } + test("CalendarInterval reflection support") { val df = Seq((1, new CalendarInterval(1, 2, 3))).toDF("a", "b") checkAnswer(df.selectExpr("b"), Row(new CalendarInterval(1, 2, 3))) From da72b87374a7be5416b99ed016dc2fc9da0ed88a Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 7 Dec 2020 13:40:15 +0000 Subject: [PATCH 130/150] [SPARK-33641][SQL] Invalidate new char/varchar types in public APIs that produce incorrect results ### What changes were proposed in this pull request? In this PR, we suppose to narrow the use cases of the char/varchar data types, of which are invalid now or later ### Why are the changes needed? 1. 
udf ```scala scala> spark.udf.register("abcd", () => "12345", org.apache.spark.sql.types.VarcharType(2)) scala> spark.sql("select abcd()").show scala.MatchError: CharType(2) (of class org.apache.spark.sql.types.VarcharType) at org.apache.spark.sql.catalyst.encoders.RowEncoder$.externalDataTypeFor(RowEncoder.scala:215) at org.apache.spark.sql.catalyst.encoders.RowEncoder$.externalDataTypeForInput(RowEncoder.scala:212) at org.apache.spark.sql.catalyst.expressions.objects.ValidateExternalType.(objects.scala:1741) at org.apache.spark.sql.catalyst.encoders.RowEncoder$.$anonfun$serializerFor$3(RowEncoder.scala:175) at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:245) at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36) at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198) at scala.collection.TraversableLike.flatMap(TraversableLike.scala:245) at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:242) at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198) at org.apache.spark.sql.catalyst.encoders.RowEncoder$.serializerFor(RowEncoder.scala:171) at org.apache.spark.sql.catalyst.encoders.RowEncoder$.apply(RowEncoder.scala:66) at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:768) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96) at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:611) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:768) at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:606) ... 47 elided ``` 2. spark.createDataframe ``` scala> spark.createDataFrame(spark.read.text("README.md").rdd, new org.apache.spark.sql.types.StructType().add("c", "char(1)")).show +--------------------+ | c| +--------------------+ | # Apache Spark| | | |Spark is a unifie...| |high-level APIs i...| |supports general ...| |rich set of highe...| |MLlib for machine...| |and Structured St...| | | | spark.read.schema("a varchar(2)").text("./README.md").show(100) +--------------------+ | a| +--------------------+ | # Apache Spark| | | |Spark is a unifie...| |high-level APIs i...| |supports general ...| ``` 4. etc ### Does this PR introduce _any_ user-facing change? NO, we intend to avoid protentical breaking change ### How was this patch tested? new tests Closes #30586 from yaooqinn/SPARK-33641. 
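A hedged sketch of the resulting behavior (it assumes a running `spark` session and, like the reproduction above, a `./README.md` file to read): schemas carrying char/varchar are now rejected up front by these public APIs, and the new internal legacy flag restores the old treat-as-string behavior.

```scala
import scala.util.Try

// Rejected up front: char/varchar may only appear in table schemas.
val failed = Try(spark.read.schema("a varchar(2)").text("./README.md"))
println(failed.failed.map(_.getMessage).getOrElse("unexpectedly succeeded"))

// Opting back into the Spark 3.0-and-earlier behavior via the new internal flag:
spark.conf.set("spark.sql.legacy.charVarcharAsString", "true")
spark.read.schema("a varchar(2)").text("./README.md").printSchema() // `a` is read as string
```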
Authored-by: Kent Yao Signed-off-by: Wenchen Fan --- .../sql/catalyst/expressions/ExprUtils.scala | 6 +- .../sql/catalyst/parser/AstBuilder.scala | 19 +-- .../sql/catalyst/parser/ParseDriver.scala | 5 - .../sql/catalyst/parser/ParserInterface.scala | 6 - .../sql/catalyst/util/CharVarcharUtils.scala | 38 +++++- .../apache/spark/sql/internal/SQLConf.scala | 13 ++ .../apache/spark/sql/types/VarcharType.scala | 2 +- .../catalyst/parser/DataTypeParserSuite.scala | 14 +-- .../parser/TableSchemaParserSuite.scala | 4 +- .../spark/sql/types/DataTypeSuite.scala | 10 ++ .../scala/org/apache/spark/sql/Column.scala | 2 +- .../apache/spark/sql/DataFrameReader.scala | 7 +- .../org/apache/spark/sql/SparkSession.scala | 10 +- .../apache/spark/sql/UDFRegistration.scala | 73 +++++++---- .../datasources/jdbc/JdbcUtils.scala | 7 +- .../org/apache/spark/sql/functions.scala | 12 +- .../spark/sql/CharVarcharTestSuite.scala | 114 ++++++++++++------ .../sql/SparkSessionExtensionSuite.scala | 3 - .../spark/sql/jdbc/JDBCWriteSuite.scala | 5 +- .../sql/hive/client/HiveClientImpl.scala | 2 +- 20 files changed, 226 insertions(+), 126 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala index 56bd3d7026d52..b45bbe417caf4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala @@ -21,7 +21,7 @@ import java.text.{DecimalFormat, DecimalFormatSymbols, ParsePosition} import java.util.Locale import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.util.ArrayBasedMapData +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CharVarcharUtils} import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType} import org.apache.spark.unsafe.types.UTF8String @@ -30,7 +30,9 @@ object ExprUtils { def evalTypeExpr(exp: Expression): DataType = { if (exp.foldable) { exp.eval() match { - case s: UTF8String if s != null => DataType.fromDDL(s.toString) + case s: UTF8String if s != null => + val dataType = DataType.fromDDL(s.toString) + CharVarcharUtils.failIfHasCharVarchar(dataType) case _ => throw new AnalysisException( s"The expression '${exp.sql}' is not a valid schema string.") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 12c5e0de686fa..a22383c62bf74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -95,19 +95,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } override def visitSingleDataType(ctx: SingleDataTypeContext): DataType = withOrigin(ctx) { - visitSparkDataType(ctx.dataType) + typedVisit[DataType](ctx.dataType) } override def visitSingleTableSchema(ctx: SingleTableSchemaContext): StructType = { - val schema = CharVarcharUtils.replaceCharVarcharWithStringInSchema( - StructType(visitColTypeList(ctx.colTypeList))) + val schema = StructType(visitColTypeList(ctx.colTypeList)) withOrigin(ctx)(schema) } - def parseRawDataType(ctx: SingleDataTypeContext): DataType = withOrigin(ctx) { - typedVisit[DataType](ctx.dataType()) - } - /* 
******************************************************************************************** * Plan parsing * ******************************************************************************************** */ @@ -1550,7 +1545,9 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * Create a [[Cast]] expression. */ override def visitCast(ctx: CastContext): Expression = withOrigin(ctx) { - Cast(expression(ctx.expression), visitSparkDataType(ctx.dataType)) + val rawDataType = typedVisit[DataType](ctx.dataType()) + val dataType = CharVarcharUtils.replaceCharVarcharWithStringForCast(rawDataType) + Cast(expression(ctx.expression), dataType) } /** @@ -2229,12 +2226,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg /* ******************************************************************************************** * DataType parsing * ******************************************************************************************** */ - /** - * Create a Spark DataType. - */ - private def visitSparkDataType(ctx: DataTypeContext): DataType = { - CharVarcharUtils.replaceCharVarcharWithString(typedVisit(ctx)) - } /** * Resolve/create a primitive type. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index ac3fbbf6b0512..d08be467f96cc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -39,11 +39,6 @@ abstract class AbstractSqlParser extends ParserInterface with SQLConfHelper with astBuilder.visitSingleDataType(parser.singleDataType()) } - /** Similar to `parseDataType`, but without CHAR/VARCHAR replacement. */ - override def parseRawDataType(sqlText: String): DataType = parse(sqlText) { parser => - astBuilder.parseRawDataType(parser.singleDataType()) - } - /** Creates Expression for a given SQL string. */ override def parseExpression(sqlText: String): Expression = parse(sqlText) { parser => astBuilder.visitSingleExpression(parser.singleExpression()) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala index d724933bc1029..77e357ad073da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala @@ -70,10 +70,4 @@ trait ParserInterface { */ @throws[ParseException]("Text cannot be parsed to a DataType") def parseDataType(sqlText: String): DataType - - /** - * Parse a string to a raw [[DataType]] without CHAR/VARCHAR replacement. 
- */ - @throws[ParseException]("Text cannot be parsed to a DataType") - def parseRawDataType(sqlText: String): DataType } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala index 0cbe5abdbbd7a..b551d9699f360 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala @@ -19,11 +19,14 @@ package org.apache.spark.sql.catalyst.util import scala.collection.mutable +import org.apache.spark.internal.Logging +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -object CharVarcharUtils { +object CharVarcharUtils extends Logging { private val CHAR_VARCHAR_TYPE_STRING_METADATA_KEY = "__CHAR_VARCHAR_TYPE_STRING" @@ -52,6 +55,19 @@ object CharVarcharUtils { dt.existsRecursively(f => f.isInstanceOf[CharType] || f.isInstanceOf[VarcharType]) } + /** + * Validate the given [[DataType]] to fail if it is char or varchar types or contains nested ones + */ + def failIfHasCharVarchar(dt: DataType): DataType = { + if (!SQLConf.get.charVarcharAsString && hasCharVarchar(dt)) { + throw new AnalysisException("char/varchar type can only be used in the table schema. " + + s"You can set ${SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key} to true, so that Spark" + + s" treat them as string type as same as Spark 3.0 and earlier") + } else { + replaceCharVarcharWithString(dt) + } + } + /** * Replaces CharType/VarcharType with StringType recursively in the given data type. */ @@ -69,6 +85,24 @@ object CharVarcharUtils { case _ => dt } + /** + * Replaces CharType/VarcharType with StringType recursively in the given data type, with a + * warning message if it has char or varchar types + */ + def replaceCharVarcharWithStringForCast(dt: DataType): DataType = { + if (SQLConf.get.charVarcharAsString) { + replaceCharVarcharWithString(dt) + } else if (hasCharVarchar(dt)) { + logWarning("The Spark cast operator does not support char/varchar type and simply treats" + + " them as string type. Please use string type directly to avoid confusion. Otherwise," + + s" you can set ${SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key} to true, so that Spark treat" + + s" them as string type as same as Spark 3.0 and earlier") + replaceCharVarcharWithString(dt) + } else { + dt + } + } + /** * Removes the metadata entry that contains the original type string of CharType/VarcharType from * the given attribute's metadata. 
@@ -85,7 +119,7 @@ object CharVarcharUtils { */ def getRawType(metadata: Metadata): Option[DataType] = { if (metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY)) { - Some(CatalystSqlParser.parseRawDataType( + Some(CatalystSqlParser.parseDataType( metadata.getString(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY))) } else { None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ea30832008b56..69f04e11ff0bc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2962,6 +2962,17 @@ object SQLConf { .booleanConf .createWithDefault(true) + val LEGACY_CHAR_VARCHAR_AS_STRING = + buildConf("spark.sql.legacy.charVarcharAsString") + .internal() + .doc("When true, Spark will not fail if user uses char and varchar type directly in those" + + " APIs that accept or parse data types as parameters, e.g." + + " `SparkSession.read.schema(...)`, `SparkSession.udf.register(...)` but treat them as" + + " string type as Spark 3.0 and earlier.") + .version("3.1.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * @@ -3612,6 +3623,8 @@ class SQLConf extends Serializable with Logging { def disabledJdbcConnectionProviders: String = getConf(SQLConf.DISABLED_JDBC_CONN_PROVIDER_LIST) + def charVarcharAsString: Boolean = getConf(SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala index 8d78640c1e125..2e30820ef0a05 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/VarcharType.scala @@ -32,6 +32,6 @@ case class VarcharType(length: Int) extends AtomicType { override def defaultSize: Int = length override def typeName: String = s"varchar($length)" - override def toString: String = s"CharType($length)" + override def toString: String = s"VarcharType($length)" private[spark] override def asNullable: VarcharType = this } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala index 655b1d26d6c90..b9f984001523a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala @@ -56,10 +56,10 @@ class DataTypeParserSuite extends SparkFunSuite { checkDataType("DATE", DateType) checkDataType("timestamp", TimestampType) checkDataType("string", StringType) - checkDataType("ChaR(5)", StringType) - checkDataType("ChaRacter(5)", StringType) - checkDataType("varchAr(20)", StringType) - checkDataType("cHaR(27)", StringType) + checkDataType("ChaR(5)", CharType(5)) + checkDataType("ChaRacter(5)", CharType(5)) + checkDataType("varchAr(20)", VarcharType(20)) + checkDataType("cHaR(27)", CharType(27)) checkDataType("BINARY", BinaryType) checkDataType("void", NullType) checkDataType("interval", CalendarIntervalType) @@ -103,9 +103,9 @@ class DataTypeParserSuite extends SparkFunSuite { StructType( StructField("deciMal", 
DecimalType.USER_DEFAULT, true) :: StructField("anotherDecimal", DecimalType(5, 2), true) :: Nil), true) :: - StructField("MAP", MapType(TimestampType, StringType), true) :: + StructField("MAP", MapType(TimestampType, VarcharType(10)), true) :: StructField("arrAy", ArrayType(DoubleType, true), true) :: - StructField("anotherArray", ArrayType(StringType, true), true) :: Nil) + StructField("anotherArray", ArrayType(CharType(9), true), true) :: Nil) ) // Use backticks to quote column names having special characters. checkDataType( @@ -113,7 +113,7 @@ class DataTypeParserSuite extends SparkFunSuite { StructType( StructField("x+y", IntegerType, true) :: StructField("!@#$%^&*()", StringType, true) :: - StructField("1_2.345<>:\"", StringType, true) :: Nil) + StructField("1_2.345<>:\"", VarcharType(20), true) :: Nil) ) // Empty struct. checkDataType("strUCt<>", StructType(Nil)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala index 95851d44b4747..5519f016e48d3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.types._ class TableSchemaParserSuite extends SparkFunSuite { @@ -69,8 +68,7 @@ class TableSchemaParserSuite extends SparkFunSuite { StructField("arrAy", ArrayType(DoubleType)) :: StructField("anotherArray", ArrayType(CharType(9))) :: Nil)) :: Nil) - assert(parse(tableSchemaString) === - CharVarcharUtils.replaceCharVarcharWithStringInSchema(expectedDataType)) + assert(parse(tableSchemaString) === expectedDataType) } // Negative cases diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala index 9442a3e87fc72..8c2e5db6e9364 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeSuite.scala @@ -249,6 +249,12 @@ class DataTypeSuite extends SparkFunSuite { checkDataTypeFromJson(MapType(IntegerType, ArrayType(DoubleType), false)) checkDataTypeFromDDL(MapType(IntegerType, ArrayType(DoubleType), false)) + checkDataTypeFromJson(CharType(1)) + checkDataTypeFromDDL(CharType(1)) + + checkDataTypeFromJson(VarcharType(10)) + checkDataTypeFromDDL(VarcharType(11)) + val metadata = new MetadataBuilder() .putString("name", "age") .build() @@ -310,6 +316,10 @@ class DataTypeSuite extends SparkFunSuite { checkDefaultSize(MapType(IntegerType, StringType, true), 24) checkDefaultSize(MapType(IntegerType, ArrayType(DoubleType), false), 12) checkDefaultSize(structType, 20) + checkDefaultSize(CharType(5), 5) + checkDefaultSize(CharType(100), 100) + checkDefaultSize(VarcharType(5), 5) + checkDefaultSize(VarcharType(10), 10) def checkEqualsIgnoreCompatibleNullability( from: DataType, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 86ba81340272b..4ef23d7e31c59 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -1185,7 +1185,7 @@ class Column(val expr: 
Expression) extends Logging { * @since 1.3.0 */ def cast(to: DataType): Column = withExpr { - Cast(expr, CharVarcharUtils.replaceCharVarcharWithString(to)) + Cast(expr, CharVarcharUtils.replaceCharVarcharWithStringForCast(to)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 007df183ee353..b94c42a2c9544 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -73,7 +73,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * @since 1.4.0 */ def schema(schema: StructType): DataFrameReader = { - this.userSpecifiedSchema = Option(CharVarcharUtils.replaceCharVarcharWithStringInSchema(schema)) + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + this.userSpecifiedSchema = Option(replaced) this } @@ -89,7 +90,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * @since 2.3.0 */ def schema(schemaString: String): DataFrameReader = { - this.userSpecifiedSchema = Option(StructType.fromDDL(schemaString)) + val rawSchema = StructType.fromDDL(schemaString) + val schema = CharVarcharUtils.failIfHasCharVarchar(rawSchema).asInstanceOf[StructType] + this.userSpecifiedSchema = Option(schema) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 3a9b06940b769..a2c9406f6becf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.encoders._ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Range} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.ExternalCommandRunner import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.ExternalCommandExecutor @@ -347,9 +348,10 @@ class SparkSession private( */ @DeveloperApi def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = withActive { + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] // TODO: use MutableProjection when rowRDD is another DataFrame and the applied // schema differs from the existing schema on any field data type. 
- val encoder = RowEncoder(schema) + val encoder = RowEncoder(replaced) val toRow = encoder.createSerializer() val catalystRows = rowRDD.map(toRow) internalCreateDataFrame(catalystRows.setName(rowRDD.name), schema) @@ -365,7 +367,8 @@ class SparkSession private( */ @DeveloperApi def createDataFrame(rowRDD: JavaRDD[Row], schema: StructType): DataFrame = { - createDataFrame(rowRDD.rdd, schema) + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + createDataFrame(rowRDD.rdd, replaced) } /** @@ -378,7 +381,8 @@ class SparkSession private( */ @DeveloperApi def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = withActive { - Dataset.ofRows(self, LocalRelation.fromExternalRows(schema.toAttributes, rows.asScala.toSeq)) + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + Dataset.ofRows(self, LocalRelation.fromExternalRows(replaced.toAttributes, rows.asScala.toSeq)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index cceb38558946e..237cfe18ed855 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} +import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.execution.aggregate.ScalaUDAF import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.{SparkUserDefinedFunction, UserDefinedAggregateFunction, UserDefinedAggregator, UserDefinedFunction} @@ -162,9 +163,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends | * @since $version | */ |def register(name: String, f: UDF$i[$extTypeArgs], returnType: DataType): Unit = { + | val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) | val func = $funcCall | def builder(e: Seq[Expression]) = if (e.length == $i) { - | ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + | ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) | } else { | throw new AnalysisException("Invalid number of arguments for function " + name + | ". Expected: $i; Found: " + e.length) @@ -753,9 +755,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 2.3.0 */ def register(name: String, f: UDF0[_], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = () => f.asInstanceOf[UDF0[Any]].call() def builder(e: Seq[Expression]) = if (e.length == 0) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 0; Found: " + e.length) @@ -768,9 +771,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) def builder(e: Seq[Expression]) = if (e.length == 1) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 1; Found: " + e.length) @@ -783,9 +787,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 2) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 2; Found: " + e.length) @@ -798,9 +803,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 3) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 3; Found: " + e.length) @@ -813,9 +819,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 4) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 4; Found: " + e.length) @@ -828,9 +835,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 5) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 5; Found: " + e.length) @@ -843,9 +851,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 6) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 6; Found: " + e.length) @@ -858,9 +867,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 7) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 7; Found: " + e.length) @@ -873,9 +883,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 8) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 8; Found: " + e.length) @@ -888,9 +899,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 9) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 9; Found: " + e.length) @@ -903,9 +915,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 10) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 10; Found: " + e.length) @@ -918,9 +931,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 11) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 11; Found: " + e.length) @@ -933,9 +947,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 12) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 12; Found: " + e.length) @@ -948,9 +963,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 13) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 13; Found: " + e.length) @@ -963,9 +979,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 14) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 14; Found: " + e.length) @@ -978,9 +995,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 15) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 15; Found: " + e.length) @@ -993,9 +1011,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 16) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 16; Found: " + e.length) @@ -1008,9 +1027,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 17) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 17; Found: " + e.length) @@ -1023,9 +1043,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 18) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 18; Found: " + e.length) @@ -1038,9 +1059,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 19) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 19; Found: " + e.length) @@ -1053,9 +1075,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF20[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 20) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 20; Found: " + e.length) @@ -1068,9 +1091,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF21[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 21) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". Expected: 21; Found: " + e.length) @@ -1083,9 +1107,10 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 1.3.0 */ def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { + val replaced = CharVarcharUtils.failIfHasCharVarchar(returnType) val func = f.asInstanceOf[UDF22[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) def builder(e: Seq[Expression]) = if (e.length == 22) { - ScalaUDF(func, returnType, e, Nil, udfName = Some(name)) + ScalaUDF(func, replaced, e, Nil, udfName = Some(name)) } else { throw new AnalysisException("Invalid number of arguments for function " + name + ". 
Expected: 22; Found: " + e.length) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 216fb02740500..f997e57b23206 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils, GenericArrayData} import org.apache.spark.sql.connector.catalog.TableChange import org.apache.spark.sql.execution.datasources.jdbc.connection.ConnectionProvider import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects, JdbcType} @@ -761,10 +761,7 @@ object JdbcUtils extends Logging { schema: StructType, caseSensitive: Boolean, createTableColumnTypes: String): Map[String, String] = { - val parsedSchema = CatalystSqlParser.parseTableSchema(createTableColumnTypes) - val userSchema = StructType(parsedSchema.map { field => - field.copy(dataType = CharVarcharUtils.getRawType(field.metadata).getOrElse(field.dataType)) - }) + val userSchema = CatalystSqlParser.parseTableSchema(createTableColumnTypes) val nameEquality = if (caseSensitive) { org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 9861d21d3a430..5b1ee2deefc10 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, ResolvedHint} -import org.apache.spark.sql.catalyst.util.TimestampFormatter +import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, TimestampFormatter} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.{Aggregator, SparkUserDefinedFunction, UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.internal.SQLConf @@ -4009,7 +4009,7 @@ object functions { * @since 2.2.0 */ def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = withExpr { - JsonToStructs(schema, options, e.expr) + JsonToStructs(CharVarcharUtils.failIfHasCharVarchar(schema), options, e.expr) } /** @@ -4040,8 +4040,9 @@ object functions { * @group collection_funcs * @since 2.2.0 */ - def from_json(e: Column, schema: DataType, options: java.util.Map[String, String]): Column = - from_json(e, schema, options.asScala.toMap) + def from_json(e: Column, schema: DataType, options: java.util.Map[String, String]): Column = { + from_json(e, CharVarcharUtils.failIfHasCharVarchar(schema), options.asScala.toMap) + } /** * Parses a column containing a JSON string into a `StructType` with the specified schema. 
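Taken together, the new `failIfHasCharVarchar` guards reject char/varchar types everywhere outside a table schema. A minimal sketch of the user-visible effect, mirroring the `CharVarcharTestSuite` cases added further down in this patch (it assumes a live `SparkSession` named `spark`; the error text is the one the analyzer produces):

```scala
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StringType, StructType, VarcharType}

// Both calls now fail analysis with
// "char/varchar type can only be used in the table schema":
//   spark.sql("""SELECT from_json('{"a": "str"}', 'a CHAR(5)')""")
//   spark.udf.register("testchar", () => "B", VarcharType(1))

// The legacy flag restores the previous behavior: char/varchar silently
// degrades to plain string.
spark.conf.set(SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")
val df = spark.sql("""SELECT from_json('{"a": "str"}', 'a CHAR(5)')""")
val parsed = df.schema.head.dataType.asInstanceOf[StructType]
assert(parsed.map(_.dataType) == Seq(StringType))
```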
@@ -4393,7 +4394,8 @@ object functions { * @since 3.0.0 */ def from_csv(e: Column, schema: StructType, options: Map[String, String]): Column = withExpr { - CsvToStructs(schema, options, e.expr) + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + CsvToStructs(replaced, options, e.expr) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala index abb13270d20e7..fcd334be7a6f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.SimpleInsertSource import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} -import org.apache.spark.sql.types.{ArrayType, CharType, DataType, MapType, StringType, StructField, StructType} +import org.apache.spark.sql.types._ // The base trait for char/varchar tests that need to be run with different table implementations. trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { @@ -435,55 +435,91 @@ class BasicCharVarcharTestSuite extends QueryTest with SharedSparkSession { assert(df.schema.map(_.dataType) == Seq(StringType)) } - assertNoCharType(spark.range(1).select($"id".cast("char(5)"))) - assertNoCharType(spark.range(1).select($"id".cast(CharType(5)))) - assertNoCharType(spark.range(1).selectExpr("CAST(id AS CHAR(5))")) - assertNoCharType(sql("SELECT CAST(id AS CHAR(5)) FROM range(1)")) + val logAppender = new LogAppender("The Spark cast operator does not support char/varchar" + + " type and simply treats them as string type. 
Please use string type directly to avoid" + + " confusion.") + withLogAppender(logAppender) { + assertNoCharType(spark.range(1).select($"id".cast("char(5)"))) + assertNoCharType(spark.range(1).select($"id".cast(CharType(5)))) + assertNoCharType(spark.range(1).selectExpr("CAST(id AS CHAR(5))")) + assertNoCharType(sql("SELECT CAST(id AS CHAR(5)) FROM range(1)")) + } } - test("user-specified schema in functions") { - val df = sql("""SELECT from_json('{"a": "str"}', 'a CHAR(5)')""") - checkAnswer(df, Row(Row("str"))) - val schema = df.schema.head.dataType.asInstanceOf[StructType] - assert(schema.map(_.dataType) == Seq(StringType)) + def failWithInvalidCharUsage[T](fn: => T): Unit = { + val e = intercept[AnalysisException](fn) + assert(e.getMessage contains "char/varchar type can only be used in the table schema") } - test("user-specified schema in DataFrameReader: file source from Dataset") { - val ds = spark.range(10).map(_.toString) - val df1 = spark.read.schema(new StructType().add("id", CharType(5))).csv(ds) - assert(df1.schema.map(_.dataType) == Seq(StringType)) - val df2 = spark.read.schema("id char(5)").csv(ds) - assert(df2.schema.map(_.dataType) == Seq(StringType)) + test("invalidate char/varchar in functions") { + failWithInvalidCharUsage(sql("""SELECT from_json('{"a": "str"}', 'a CHAR(5)')""")) + withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) { + val df = sql("""SELECT from_json('{"a": "str"}', 'a CHAR(5)')""") + checkAnswer(df, Row(Row("str"))) + val schema = df.schema.head.dataType.asInstanceOf[StructType] + assert(schema.map(_.dataType) == Seq(StringType)) + } } - test("user-specified schema in DataFrameReader: DSV1") { - def checkSchema(df: DataFrame): Unit = { - val relations = df.queryExecution.analyzed.collect { - case l: LogicalRelation => l.relation - } - assert(relations.length == 1) - assert(relations.head.schema.map(_.dataType) == Seq(StringType)) + test("invalidate char/varchar in SparkSession createDataframe") { + val df = spark.range(10).map(_.toString).toDF() + val schema = new StructType().add("id", CharType(5)) + failWithInvalidCharUsage(spark.createDataFrame(df.collectAsList(), schema)) + failWithInvalidCharUsage(spark.createDataFrame(df.rdd, schema)) + failWithInvalidCharUsage(spark.createDataFrame(df.toJavaRDD, schema)) + withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) { + val df1 = spark.createDataFrame(df.collectAsList(), schema) + checkAnswer(df1, df) + assert(df1.schema.head.dataType === StringType) } - - checkSchema(spark.read.schema(new StructType().add("id", CharType(5))) - .format(classOf[SimpleInsertSource].getName).load()) - checkSchema(spark.read.schema("id char(5)") - .format(classOf[SimpleInsertSource].getName).load()) } - test("user-specified schema in DataFrameReader: DSV2") { - def checkSchema(df: DataFrame): Unit = { - val tables = df.queryExecution.analyzed.collect { - case d: DataSourceV2Relation => d.table + test("invalidate char/varchar in spark.read.schema") { + failWithInvalidCharUsage(spark.read.schema(new StructType().add("id", CharType(5)))) + failWithInvalidCharUsage(spark.read.schema("id char(5)")) + withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) { + val ds = spark.range(10).map(_.toString) + val df1 = spark.read.schema(new StructType().add("id", CharType(5))).csv(ds) + assert(df1.schema.map(_.dataType) == Seq(StringType)) + val df2 = spark.read.schema("id char(5)").csv(ds) + assert(df2.schema.map(_.dataType) == Seq(StringType)) + + def checkSchema(df: DataFrame): Unit = { + val 
schemas = df.queryExecution.analyzed.collect { + case l: LogicalRelation => l.relation.schema + case d: DataSourceV2Relation => d.table.schema() + } + assert(schemas.length == 1) + assert(schemas.head.map(_.dataType) == Seq(StringType)) } - assert(tables.length == 1) - assert(tables.head.schema.map(_.dataType) == Seq(StringType)) - } - checkSchema(spark.read.schema(new StructType().add("id", CharType(5))) - .format(classOf[SchemaRequiredDataSource].getName).load()) - checkSchema(spark.read.schema("id char(5)") - .format(classOf[SchemaRequiredDataSource].getName).load()) + // user-specified schema in DataFrameReader: DSV1 + checkSchema(spark.read.schema(new StructType().add("id", CharType(5))) + .format(classOf[SimpleInsertSource].getName).load()) + checkSchema(spark.read.schema("id char(5)") + .format(classOf[SimpleInsertSource].getName).load()) + + // user-specified schema in DataFrameReader: DSV2 + checkSchema(spark.read.schema(new StructType().add("id", CharType(5))) + .format(classOf[SchemaRequiredDataSource].getName).load()) + checkSchema(spark.read.schema("id char(5)") + .format(classOf[SchemaRequiredDataSource].getName).load()) + } + } + + test("invalidate char/varchar in udf's result type") { + failWithInvalidCharUsage(spark.udf.register("testchar", () => "B", VarcharType(1))) + failWithInvalidCharUsage(spark.udf.register("testchar2", (x: String) => x, VarcharType(1))) + withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) { + spark.udf.register("testchar", () => "B", VarcharType(1)) + spark.udf.register("testchar2", (x: String) => x, VarcharType(1)) + val df1 = spark.sql("select testchar()") + checkAnswer(df1, Row("B")) + assert(df1.schema.head.dataType === StringType) + val df2 = spark.sql("select testchar2('abc')") + checkAnswer(df2, Row("abc")) + assert(df2.schema.head.dataType === StringType) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index f02d2041dd7f3..ea276bcec0f78 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -384,9 +384,6 @@ case class MyParser(spark: SparkSession, delegate: ParserInterface) extends Pars override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText) - - override def parseRawDataType(sqlText: String): DataType = - delegate.parseRawDataType(sqlText) } object MyExtensions { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index fb46c2ff4c0ea..1a28523cc939f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -390,14 +390,13 @@ class JDBCWriteSuite extends SharedSparkSession with BeforeAndAfter { .foldLeft(new StructType())((schema, colType) => schema.add(colType._1, colType._2)) val createTableColTypes = colTypes.map { case (col, dataType) => s"$col $dataType" }.mkString(", ") - val df = spark.createDataFrame(sparkContext.parallelize(Seq(Row.empty)), schema) val expectedSchemaStr = colTypes.map { case (col, dataType) => s""""$col" $dataType """ }.mkString(", ") assert(JdbcUtils.schemaString( - df.schema, - df.sqlContext.conf.caseSensitiveAnalysis, + schema, + spark.sqlContext.conf.caseSensitiveAnalysis, url1, Option(createTableColTypes)) == 
expectedSchemaStr) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index bada131c8ba6d..34befb8a6f965 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -985,7 +985,7 @@ private[hive] object HiveClientImpl extends Logging { /** Get the Spark SQL native DataType from Hive's FieldSchema. */ private def getSparkSQLDataType(hc: FieldSchema): DataType = { try { - CatalystSqlParser.parseRawDataType(hc.getType) + CatalystSqlParser.parseDataType(hc.getType) } catch { case e: ParseException => throw new SparkException( From c62b84a0432e51fd10e628088ee311dc3be73d2f Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Mon, 7 Dec 2020 08:40:29 -0600 Subject: [PATCH 131/150] [MINOR] Spelling sql not core ### What changes were proposed in this pull request? This PR intends to fix typos in the sub-modules: * `sql/catalyst` * `sql/hive-thriftserver` * `sql/hive` Split per srowen https://github.com/apache/spark/pull/30323#issuecomment-728981618 NOTE: The misspellings have been reported at https://github.com/jsoref/spark/commit/706a726f87a0bbf5e31467fae9015218773db85b#commitcomment-44064356 ### Why are the changes needed? Misspelled words make it harder to read / understand content. ### Does this PR introduce _any_ user-facing change? There are various fixes to documentation, etc... ### How was this patch tested? No testing was performed Closes #30532 from jsoref/spelling-sql-not-core. Authored-by: Josh Soref Signed-off-by: Sean Owen --- .../src/main/scala/org/apache/spark/sql/Row.scala | 6 +++--- .../apache/spark/sql/catalyst/StructFilters.scala | 2 +- .../spark/sql/catalyst/analysis/Analyzer.scala | 2 +- .../spark/sql/catalyst/analysis/CheckAnalysis.scala | 2 +- .../sql/catalyst/analysis/StreamingJoinHelper.scala | 4 ++-- .../analysis/UpdateAttributeNullability.scala | 2 +- .../spark/sql/catalyst/catalog/SessionCatalog.scala | 2 +- .../spark/sql/catalyst/csv/CSVInferSchema.scala | 2 +- .../sql/catalyst/encoders/ExpressionEncoder.scala | 2 +- .../spark/sql/catalyst/expressions/AliasHelper.scala | 2 +- .../spark/sql/catalyst/expressions/ScalaUDF.scala | 4 ++-- .../catalyst/expressions/aggregate/Percentile.scala | 6 +++--- .../spark/sql/catalyst/expressions/arithmetic.scala | 2 +- .../catalyst/expressions/codegen/CodeGenerator.scala | 2 +- .../expressions/codegen/GenerateSafeProjection.scala | 2 +- .../apache/spark/sql/catalyst/expressions/hash.scala | 4 ++-- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../sql/catalyst/expressions/jsonExpressions.scala | 12 ++++++------ .../sql/catalyst/expressions/regexpExpressions.scala | 2 +- .../sql/catalyst/expressions/windowExpressions.scala | 2 +- .../catalyst/optimizer/NestedColumnAliasing.scala | 2 +- .../spark/sql/catalyst/optimizer/Optimizer.scala | 4 ++-- .../optimizer/PushDownLeftSemiAntiJoin.scala | 2 +- .../spark/sql/catalyst/optimizer/expressions.scala | 2 +- .../spark/sql/catalyst/optimizer/subquery.scala | 6 +++--- .../spark/sql/catalyst/parser/ParserUtils.scala | 2 +- .../apache/spark/sql/catalyst/plans/QueryPlan.scala | 2 +- .../sql/catalyst/plans/logical/LogicalPlan.scala | 2 +- .../sql/catalyst/plans/logical/PlanHelper.scala | 2 +- .../plans/logical/basicLogicalOperators.scala | 2 +- .../sql/catalyst/plans/physical/partitioning.scala | 2 +- .../sql/catalyst/util/DateTimeFormatterHelper.scala | 4 
++-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 2 +- .../spark/sql/catalyst/util/QuantileSummaries.scala | 6 +++--- .../org/apache/spark/sql/internal/SQLConf.scala | 6 +++--- .../org/apache/spark/sql/RandomDataGenerator.scala | 6 +++--- .../spark/sql/catalyst/analysis/AnalysisSuite.scala | 2 +- .../analysis/ResolveGroupingAnalyticsSuite.scala | 4 ++-- .../sql/catalyst/analysis/TypeCoercionSuite.scala | 2 +- .../analysis/UnsupportedOperationsSuite.scala | 2 +- .../catalyst/expressions/CodeGenerationSuite.scala | 4 ++-- .../sql/catalyst/expressions/ComplexTypeSuite.scala | 4 ++-- .../expressions/ConditionalExpressionSuite.scala | 4 ++-- .../catalyst/expressions/ExpressionEvalHelper.scala | 2 +- .../expressions/ObjectExpressionsSuite.scala | 4 ++-- .../expressions/StringExpressionsSuite.scala | 2 +- .../expressions/aggregate/PercentileSuite.scala | 8 ++++---- .../expressions/codegen/CodeBlockSuite.scala | 2 +- .../sql/catalyst/optimizer/SetOperationSuite.scala | 8 ++++---- .../spark/sql/catalyst/parser/DDLParserSuite.scala | 2 +- .../sql/catalyst/parser/DataTypeParserSuite.scala | 4 ++-- .../spark/sql/catalyst/parser/ErrorParserSuite.scala | 2 +- .../sql/catalyst/parser/ExpressionParserSuite.scala | 4 ++-- .../catalyst/parser/TableIdentifierParserSuite.scala | 2 +- .../spark/sql/catalyst/util/UnsafeArraySuite.scala | 8 ++++---- .../apache/hive/service/cli/ColumnDescriptor.java | 2 +- .../org/apache/hive/service/cli/GetInfoValue.java | 2 +- .../service/cli/operation/GetColumnsOperation.java | 2 +- .../hive/service/cli/session/HiveSessionImpl.java | 4 ++-- .../service/cli/thrift/ThriftHttpCLIService.java | 2 +- .../spark/sql/hive/thriftserver/DummyListeners.scala | 2 +- .../sql/hive/thriftserver/SparkSQLEnvSuite.scala | 2 +- .../sql/hive/execution/HiveCompatibilitySuite.scala | 12 ++++++------ .../apache/spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../spark/sql/hive/client/HiveClientImpl.scala | 4 ++-- .../execution/HiveScriptTransformationExec.scala | 2 +- .../sql/hive/execution/InsertIntoHiveTable.scala | 2 +- .../hive/execution/PruneHiveTablePartitions.scala | 2 +- .../queries/clientpositive/auto_sortmerge_join_13.q | 6 +++--- .../clientpositive/bucketsortoptimize_insert_3.q | 4 ++-- .../src/test/queries/clientpositive/smb_mapjoin_20.q | 2 +- .../org/apache/spark/sql/hive/InsertSuite.scala | 4 ++-- .../org/apache/spark/sql/hive/StatisticsSuite.scala | 2 +- .../execution/HiveScriptTransformationSuite.scala | 4 ++-- .../spark/sql/hive/execution/SQLQuerySuite.scala | 8 ++++---- 75 files changed, 128 insertions(+), 128 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index 88c672f1cdf85..d43c57ed0f5c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -351,7 +351,7 @@ trait Row extends Serializable { /** * Returns the value at position i. * For primitive types if value is null it returns 'zero value' specific for primitive - * ie. 0 for Int - use isNullAt to ensure that value is not null + * i.e. 0 for Int - use isNullAt to ensure that value is not null * * @throws ClassCastException when data type does not match. */ @@ -360,7 +360,7 @@ trait Row extends Serializable { /** * Returns the value of a given fieldName. * For primitive types if value is null it returns 'zero value' specific for primitive - * ie. 0 for Int - use isNullAt to ensure that value is not null + * i.e. 
0 for Int - use isNullAt to ensure that value is not null * * @throws UnsupportedOperationException when schema is not defined. * @throws IllegalArgumentException when fieldName do not exist. @@ -381,7 +381,7 @@ trait Row extends Serializable { /** * Returns a Map consisting of names and values for the requested fieldNames * For primitive types if value is null it returns 'zero value' specific for primitive - * ie. 0 for Int - use isNullAt to ensure that value is not null + * i.e. 0 for Int - use isNullAt to ensure that value is not null * * @throws UnsupportedOperationException when schema is not defined. * @throws IllegalArgumentException when fieldName do not exist. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala index fed1b323f5773..ff67b6fccfae9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/StructFilters.scala @@ -51,7 +51,7 @@ abstract class StructFilters(pushedFilters: Seq[sources.Filter], schema: StructT /** * Resets states of pushed down filters. The method must be called before - * precessing any new row otherwise `skipRow()` may return wrong result. + * processing any new row otherwise `skipRow()` may return wrong result. */ def reset(): Unit diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6769dc895d32e..6541961f5613e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1495,7 +1495,7 @@ class Analyzer(override val catalogManager: CatalogManager) val rightRes = rightAttributes .map(x => resolveExpressionBottomUp(x, right).asInstanceOf[Attribute]) f.copy(leftAttributes = leftRes, rightAttributes = rightRes) - // intersect/except will be rewritten to join at the begininng of optimizer. Here we need to + // intersect/except will be rewritten to join at the beginning of optimizer. Here we need to // deduplicate the right side plan, so that we won't produce an invalid self-join later. 
case i @ Intersect(left, right, _) if !i.duplicateResolved => i.copy(right = dedupRight(left, right)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 11c4883992560..9f5eefc744135 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -579,7 +579,7 @@ trait CheckAnalysis extends PredicateHelper { case showPartitions: ShowPartitions => checkShowPartitions(showPartitions) - case _ => // Fallbacks to the following checks + case _ => // Falls back to the following checks } operator match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala index cddc3a44f4d9d..d8e200d6b01e4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala @@ -55,7 +55,7 @@ object StreamingJoinHelper extends PredicateHelper with Logging { * given the join condition and the event time watermark. This is how it works. * - The condition is split into conjunctive predicates, and we find the predicates of the * form `leftTime + c1 < rightTime + c2` (or <=, >, >=). - * - We canoncalize the predicate and solve it with the event time watermark value to find the + * - We canonicalize the predicate and solve it with the event time watermark value to find the * value of the state watermark. * This function is supposed to make best-effort attempt to get the state watermark. If there is * any error, it will return None. @@ -94,7 +94,7 @@ object StreamingJoinHelper extends PredicateHelper with Logging { // The generated the state watermark cleanup expression is inclusive of the state watermark. // If state watermark is W, all state where timestamp <= W will be cleaned up. - // Now when the canonicalized join condition solves to leftTime >= W, we dont want to clean + // Now when the canonicalized join condition solves to leftTime >= W, we don't want to clean // up leftTime <= W. Rather we should clean up leftTime <= W - 1. Hence the -1 below. val stateWatermark = predicate match { case LessThan(l, r) => getStateWatermarkSafely(l, r) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UpdateAttributeNullability.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UpdateAttributeNullability.scala index 3eae34da7e502..5004108d348b6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UpdateAttributeNullability.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UpdateAttributeNullability.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.rules.Rule * Updates nullability of Attributes in a resolved LogicalPlan by using the nullability of * corresponding Attributes of its children output Attributes. This step is needed because * users can use a resolved AttributeReference in the Dataset API and outer joins - * can change the nullability of an AttribtueReference. Without this rule, a nullable column's + * can change the nullability of an AttributeReference. 
Without this rule, a nullable column's * nullable field can be actually set as non-nullable, which cause illegal optimization * (e.g., NULL propagation) and wrong answers. * See SPARK-13484 and SPARK-13801 for the concrete queries of this case. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index a2ab756382488..4c32870abe621 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -1615,7 +1615,7 @@ class SessionCatalog( } /** - * Validate the new locatoin before renaming a managed table, which should be non-existent. + * Validate the new location before renaming a managed table, which should be non-existent. */ private def validateNewLocationOfRename( oldName: TableIdentifier, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 56677d7d97af2..fd9e30d155148 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -143,7 +143,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { // The conversion can fail when the `field` is not a form of number. val bigDecimal = decimalParser(field) // Because many other formats do not support decimal, it reduces the cases for - // decimals by disallowing values having scale (eg. `1.1`). + // decimals by disallowing values having scale (e.g. `1.1`). if (bigDecimal.scale <= 0) { // `DecimalType` conversion can fail when // 1. The precision is bigger than 38. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 9ab38044e6a88..80a0374ae1f26 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -189,7 +189,7 @@ object ExpressionEncoder { } /** - * Function that serializesa an object of type `T` to an [[InternalRow]]. This class is not + * Function that serializes an object of type `T` to an [[InternalRow]]. This class is not * thread-safe. Note that multiple calls to `apply(..)` return the same actual [[InternalRow]] * object. Thus, the caller should copy the result before making another call if required. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala index c61eb68db5bfa..ad6cf959a69c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala @@ -64,7 +64,7 @@ trait AliasHelper { /** * Replace all attributes, that reference an alias, with the aliased expression, - * but keep the name of the outmost attribute. + * but keep the name of the outermost attribute. 
*/ protected def replaceAliasButKeepName( expr: NamedExpression, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 0a69d5aa6b9ad..4a89d24e5f635 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -1145,7 +1145,7 @@ case class ScalaUDF( val resultConverter = s"$convertersTerm[${children.length}]" val boxedType = CodeGenerator.boxedType(dataType) - val funcInvokation = if (isPrimitive(dataType) + val funcInvocation = if (isPrimitive(dataType) // If the output is nullable, the returned value must be unwrapped from the Option && !nullable) { s"$resultTerm = ($boxedType)$getFuncResult" @@ -1156,7 +1156,7 @@ case class ScalaUDF( s""" |$boxedType $resultTerm = null; |try { - | $funcInvokation; + | $funcInvocation; |} catch (Exception e) { | throw new org.apache.spark.SparkException($errorMsgTerm, e); |} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala index 0eba61c741133..b808083152cd3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala @@ -191,13 +191,13 @@ case class Percentile( val sortedCounts = buffer.toSeq.sortBy(_._1)( child.dataType.asInstanceOf[NumericType].ordering.asInstanceOf[Ordering[AnyRef]]) - val accumlatedCounts = sortedCounts.scanLeft((sortedCounts.head._1, 0L)) { + val accumulatedCounts = sortedCounts.scanLeft((sortedCounts.head._1, 0L)) { case ((key1, count1), (key2, count2)) => (key2, count1 + count2) }.tail - val maxPosition = accumlatedCounts.last._2 - 1 + val maxPosition = accumulatedCounts.last._2 - 1 percentages.map { percentile => - getPercentile(accumlatedCounts, maxPosition * percentile) + getPercentile(accumulatedCounts, maxPosition * percentile) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index c69edccc696bb..3fbb798f1fd53 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -216,7 +216,7 @@ abstract class BinaryArithmetic extends BinaryOperator with NullIntolerant { case DoubleType | FloatType => // When Double/Float overflows, there can be 2 cases: // - precision loss: according to SQL standard, the number is truncated; - // - returns (+/-)Infinite: same behavior also other DBs have (eg. Postgres) + // - returns (+/-)Infinite: same behavior also other DBs have (e.g. 
Postgres) nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" |${ev.value} = $eval1 $symbol $eval2; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 1ff4a93cf0acd..638878b312dc5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -175,7 +175,7 @@ class CodegenContext extends Logging { mutable.ArrayBuffer.empty[(String, String)] /** - * The mapping between mutable state types and corrseponding compacted arrays. + * The mapping between mutable state types and corresponding compacted arrays. * The keys are java type string. The values are [[MutableStateArrays]] which encapsulates * the compacted arrays for the mutable states with the same java type. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index e285398ba1958..4efcca0017eaa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, Generic import org.apache.spark.sql.types._ /** - * Java can not access Projection (in package object) + * Java cannot access Projection (in package object) */ abstract class BaseProjection extends Projection {} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 64360827fb794..ce177f50956f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -907,7 +907,7 @@ object HiveHashFunction extends InterpretedHashFunction { * - year, month (stored as HiveIntervalYearMonth) * - day, hour, minute, second, nanosecond (stored as HiveIntervalDayTime) * - * eg. (INTERVAL '30' YEAR + INTERVAL '-23' DAY) fails in Hive + * e.g. (INTERVAL '30' YEAR + INTERVAL '-23' DAY) fails in Hive * * This method mimics HiveIntervalDayTime.hashCode() in Hive. * @@ -919,7 +919,7 @@ object HiveHashFunction extends InterpretedHashFunction { * * - Spark's [[CalendarInterval]] has precision upto microseconds but Hive's * HiveIntervalDayTime can store data with precision upto nanoseconds. So, any input intervals - * with nanosecond values will lead to wrong output hashes (ie. non adherent with Hive output) + * with nanosecond values will lead to wrong output hashes (i.e. 
non adherent with Hive output) */ def hashCalendarInterval(calendarInterval: CalendarInterval): Long = { val totalMicroSeconds = calendarInterval.days * MICROS_PER_DAY + calendarInterval.microseconds diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index 4454afb6c099b..d1dabe732c882 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -128,7 +128,7 @@ trait HigherOrderFunction extends Expression with ExpectsInputTypes { def argumentTypes: Seq[AbstractDataType] /** - * All arguments have been resolved. This means that the types and nullabilty of (most of) the + * All arguments have been resolved. This means that the types and nullability of (most of) the * lambda function arguments is known, and that we can start binding the lambda functions. */ lazy val argumentsResolved: Boolean = arguments.forall(_.resolved) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index a363615d3afe0..c22b68890a0d6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -808,10 +808,10 @@ case class SchemaOfJson( } /** - * A function that returns the number of elements in the outmost JSON array. + * A function that returns the number of elements in the outermost JSON array. */ @ExpressionDescription( - usage = "_FUNC_(jsonArray) - Returns the number of elements in the outmost JSON array.", + usage = "_FUNC_(jsonArray) - Returns the number of elements in the outermost JSON array.", arguments = """ Arguments: * jsonArray - A JSON array. `NULL` is returned in case of any other valid JSON string, @@ -877,13 +877,13 @@ case class LengthOfJsonArray(child: Expression) extends UnaryExpression } /** - * A function which returns all the keys of the outmost JSON object. + * A function which returns all the keys of the outermost JSON object. */ @ExpressionDescription( - usage = "_FUNC_(json_object) - Returns all the keys of the outmost JSON object as an array.", + usage = "_FUNC_(json_object) - Returns all the keys of the outermost JSON object as an array.", arguments = """ Arguments: - * json_object - A JSON object. If a valid JSON object is given, all the keys of the outmost + * json_object - A JSON object. If a valid JSON object is given, all the keys of the outermost object will be returned as an array. If it is any other valid JSON string, an invalid JSON string or an empty string, the function returns null. 
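The "outermost" wording above refers to the top-level JSON structure only; nested objects and arrays count as single elements. A small sketch of what that means in practice (it assumes a `SparkSession` named `spark` and the SQL names `json_object_keys` / `json_array_length` these expressions are registered under):

```scala
// Only the outermost object is inspected, so nested keys are not returned.
spark.sql("""SELECT json_object_keys('{"a": 1, "b": {"c": 2}}')""").collect()
// => keys of the outermost object only: ["a", "b"]

// The nested array is one element of the outermost array.
spark.sql("""SELECT json_array_length('[1, [2, 3], 4]')""").collect()
// => 3
```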
""", @@ -921,7 +921,7 @@ case class JsonObjectKeys(child: Expression) extends UnaryExpression with Codege if (parser.nextToken() == null || parser.currentToken() != JsonToken.START_OBJECT) { return null } - // Parse the JSON string to get all the keys of outmost JSON object + // Parse the JSON string to get all the keys of outermost JSON object getJsonKeys(parser, input) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 0b94fe8b5d47e..28c9aefb42837 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -93,7 +93,7 @@ abstract class StringRegexExpression extends BinaryExpression Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order to match "\abc", the pattern should be "\\abc". - When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it fallbacks + When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back to Spark 1.6 behavior regarding string literal parsing. For example, if the config is enabled, the pattern to match "\abc" should be "\abc". * escape - an character added since Spark 3.0. The default escape character is the '\'. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index b6dd817794723..43ecbd6a83fdb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -173,7 +173,7 @@ sealed trait WindowFrame extends Expression with Unevaluable { case object UnspecifiedFrame extends WindowFrame /** - * A specified Window Frame. The val lower/uppper can be either a foldable [[Expression]] or a + * A specified Window Frame. The val lower/upper can be either a foldable [[Expression]] or a * [[SpecialFrameBoundary]]. */ case class SpecifiedWindowFrame( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala index b053bf6d61e6b..0be2792bfd7db 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala @@ -227,7 +227,7 @@ object NestedColumnAliasing { } /** - * This prunes unnessary nested columns from `Generate` and optional `Project` on top + * This prunes unnecessary nested columns from `Generate` and optional `Project` on top * of it. */ object GeneratorNestedColumnAliasing { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index b7c8f775b857f..aa8540fb44556 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -853,7 +853,7 @@ object CollapseWindow extends Rule[LogicalPlan] { * of the child window expression, transpose them. 
*/ object TransposeWindow extends Rule[LogicalPlan] { - private def compatibleParititions(ps1 : Seq[Expression], ps2: Seq[Expression]): Boolean = { + private def compatiblePartitions(ps1 : Seq[Expression], ps2: Seq[Expression]): Boolean = { ps1.length < ps2.length && ps2.take(ps1.length).permutations.exists(ps1.zip(_).forall { case (l, r) => l.semanticEquals(r) }) @@ -864,7 +864,7 @@ object TransposeWindow extends Rule[LogicalPlan] { if w1.references.intersect(w2.windowOutputSet).isEmpty && w1.expressions.forall(_.deterministic) && w2.expressions.forall(_.deterministic) && - compatibleParititions(ps1, ps2) => + compatiblePartitions(ps1, ps2) => Project(w1.output, Window(we2, ps2, os2, Window(we1, ps1, os1, grandChild))) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala index 50fe0192d6f26..286b447cdb5a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PushDownLeftSemiAntiJoin.scala @@ -172,7 +172,7 @@ object PushDownLeftSemiAntiJoin extends Rule[LogicalPlan] with PredicateHelper { * TODO: * Currently this rule can push down the left semi or left anti joins to either * left or right leg of the child join. This matches the behaviour of `PushPredicateThroughJoin` - * when the lefi semi or left anti join is in expression form. We need to explore the possibility + * when the left semi or left anti join is in expression form. We need to explore the possibility * to push the left semi/anti joins to both legs of join if the join condition refers to * both left and right legs of the child join. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 1b1e2ad71e7c8..4cdaf10dd3c60 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -70,7 +70,7 @@ object ConstantFolding extends Rule[LogicalPlan] { /** * Substitutes [[Attribute Attributes]] which can be statically evaluated with their corresponding * value in conjunctive [[Expression Expressions]] - * eg. + * e.g. * {{{ * SELECT * FROM table WHERE i = 5 AND j = i + 3 * ==> SELECT * FROM table WHERE i = 5 AND j = 8 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 3c2ee3149d317..9d023b7f11401 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -63,7 +63,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { // the produced join then becomes unresolved and break structural integrity. We should // de-duplicate conflicting attributes. // SPARK-26078: it may also happen that the subquery has conflicting attributes with the outer - // values. In this case, the resulting join would contain trivially true conditions (eg. + // values. In this case, the resulting join would contain trivially true conditions (e.g. // id#3 = id#3) which cannot be de-duplicated after. 
In this method, if there are conflicting // attributes in the join condition, the subquery's conflicting attributes are changed using // a projection which aliases them and resolves the problem. @@ -174,7 +174,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { val inConditions = values.zip(sub.output).map(EqualTo.tupled) // To handle a null-aware predicate not-in-subquery in nested conditions // (e.g., `v > 0 OR t1.id NOT IN (SELECT id FROM t2)`), we transform - // `inConditon` (t1.id=t2.id) into `(inCondition) OR ISNULL(inCondition)`. + // `inCondition` (t1.id=t2.id) into `(inCondition) OR ISNULL(inCondition)`. // // For example, `SELECT * FROM t1 WHERE v > 0 OR t1.id NOT IN (SELECT id FROM t2)` // is transformed into a plan below; @@ -567,7 +567,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe subqueryRoot = Project(projList ++ havingInputs, subqueryRoot) case s @ SubqueryAlias(alias, _) => subqueryRoot = SubqueryAlias(alias, subqueryRoot) - case op => sys.error(s"Unexpected operator $op in corelated subquery") + case op => sys.error(s"Unexpected operator $op in correlated subquery") } // CASE WHEN alwaysTrue IS NULL THEN resultOnZeroTups diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala index 1f32620e54902..948b94a7e9d66 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala @@ -127,7 +127,7 @@ object ParserUtils { } } - /** Unescape baskslash-escaped string enclosed by quotes. */ + /** Unescape backslash-escaped string enclosed by quotes. */ def unescapeSQLString(b: String): String = { var enclosure: Character = null val sb = new StringBuilder(b.length()) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index 864ca4f57483d..e0839a34ae589 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -397,7 +397,7 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] case ar: AttributeReference if allAttributes.indexOf(ar.exprId) == -1 => // Top level `AttributeReference` may also be used for output like `Alias`, we should - // normalize the epxrId too. + // normalize the exprId too. id += 1 ar.withExprId(ExprId(id)).canonicalized diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index ad5c3fd74e9b5..1a9c9d14e3eed 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -136,7 +136,7 @@ abstract class LogicalPlan def outputOrdering: Seq[SortOrder] = Nil /** - * Returns true iff `other`'s output is semantically the same, ie.: + * Returns true iff `other`'s output is semantically the same, i.e.: * - it contains the same number of `Attribute`s; * - references are the same; * - the order is equal too. 
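The contract documented above (same number of attributes, same references, same order) amounts to a pairwise comparison of the two output lists. A rough paraphrase of that contract, not necessarily the exact implementation:

```scala
import org.apache.spark.sql.catalyst.expressions.Attribute

// Same arity, and the attributes match pairwise, in order.
def sameOutputSketch(left: Seq[Attribute], right: Seq[Attribute]): Boolean =
  left.length == right.length &&
    left.zip(right).forall { case (l, r) => l.semanticEquals(r) }
```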
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/PlanHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/PlanHelper.scala index 63348f766a5b1..5ec488efc328c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/PlanHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/PlanHelper.scala @@ -29,7 +29,7 @@ object PlanHelper { /** * Check if there's any expression in this query plan operator that is * - A WindowExpression but the plan is not Window - * - An AggregateExpresion but the plan is not Aggregate or Window + * - An AggregateExpression but the plan is not Aggregate or Window * - A Generator but the plan is not Generate * Returns the list of invalid expressions that this operator hosts. This can happen when * 1. The input query from users contain invalid expressions. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index aa7151ad36850..0e4bfa4dc34da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -675,7 +675,7 @@ object Expand { val numAttributes = attrMap.size assert(numAttributes <= GroupingID.dataType.defaultSize * 8) val mask = if (numAttributes != 64) (1L << numAttributes) - 1 else 0xFFFFFFFFFFFFFFFFL - // Calculate the attrbute masks of selected grouping set. For example, if we have GroupBy + // Calculate the attribute masks of selected grouping set. For example, if we have GroupBy // attributes (a, b, c, d), grouping set (a, c) will produce the following sequence: // (15, 7, 13), whose binary form is (1111, 0111, 1101) val masks = (mask +: groupingSetAttrs.map(attrMap).map(index => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index 17e1cb416fc8a..c4002aa441a50 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -164,7 +164,7 @@ trait Partitioning { * i.e. the current dataset does not need to be re-partitioned for the `required` * Distribution (it is possible that tuples within a partition need to be reorganized). * - * A [[Partitioning]] can never satisfy a [[Distribution]] if its `numPartitions` does't match + * A [[Partitioning]] can never satisfy a [[Distribution]] if its `numPartitions` doesn't match * [[Distribution.requiredNumPartitions]]. */ final def satisfies(required: Distribution): Boolean = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala index f02b2d08c0935..eac34c8f076a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala @@ -226,8 +226,8 @@ private object DateTimeFormatterHelper { // string at res(0). 
So when the first element here is empty string we do not need append `'` // literal to the DateTimeFormatterBuilder. case ("", idx) if idx != 0 => builder.appendLiteral("'") - case (pattenPart, idx) if idx % 2 == 0 => - var rest = pattenPart + case (patternPart, idx) if idx % 2 == 0 => + var rest = patternPart while (rest.nonEmpty) { rest match { case extractor(prefix, secondFraction, suffix) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 87cf3c93ba26e..0543ef99f8947 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -189,7 +189,7 @@ object DateTimeUtils { * precision, so this conversion is lossy. */ def microsToMillis(micros: Long): Long = { - // When the timestamp is negative i.e before 1970, we need to adjust the millseconds portion. + // When the timestamp is negative i.e before 1970, we need to adjust the milliseconds portion. // Example - 1965-01-01 10:11:12.123456 is represented as (-157700927876544) in micro precision. // In millis precision the above needs to be represented as (-157700927877). Math.floorDiv(micros, MICROS_PER_MILLIS) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index ae7066d87d530..addf1408a33a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -173,13 +173,13 @@ class QuantileSummaries( // Take the case of the sample `10` from `b`. In the original stream, it could have appeared // right after `0` (as expressed by `g=1`) or right before `20`, so `delta=99+0-1=98`. // In the GK algorithm's style of working in terms of maximum bounds, one can observe that the - // maximum additional uncertainty over samples comming from `b` is `max(g_a + delta_a) = + // maximum additional uncertainty over samples coming from `b` is `max(g_a + delta_a) = // floor(2 * eps_a * n_a)`. Likewise, additional uncertainty over samples from `a` is // `floor(2 * eps_b * n_b)`. // Only samples that interleave the other side are affected. That means that samples from // one side that are lesser (or greater) than all samples from the other side are just copied - // unmodifed. - // If the merging instances have different `relativeError`, the resulting instance will cary + // unmodified. + // If the merging instances have different `relativeError`, the resulting instance will carry // the largest one: `eps_ab = max(eps_a, eps_b)`. // The main invariant of the GK algorithm is kept: // `max(g_ab + delta_ab) <= floor(2 * eps_ab * (n_a + n_b))` since diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 69f04e11ff0bc..e8e1120cbb884 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1145,7 +1145,7 @@ object SQLConf { val CODEGEN_FACTORY_MODE = buildConf("spark.sql.codegen.factoryMode") .doc("This config determines the fallback behavior of several codegen generators " + - "during tests. 
`FALLBACK` means trying codegen first and then fallbacking to " + + "during tests. `FALLBACK` means trying codegen first and then falling back to " + "interpreted if any compile error happens. Disabling fallback if `CODEGEN_ONLY`. " + "`NO_CODEGEN` skips codegen and goes interpreted path always. Note that " + "this config works only for tests.") @@ -1570,7 +1570,7 @@ object SQLConf { val JSON_EXPRESSION_OPTIMIZATION = buildConf("spark.sql.optimizer.enableJsonExpressionOptimization") .doc("Whether to optimize JSON expressions in SQL optimizer. It includes pruning " + - "unnecessary columns from from_json, simplifing from_json + to_json, to_json + " + + "unnecessary columns from from_json, simplifying from_json + to_json, to_json + " + "named_struct(from_json.col1, from_json.col2, ....).") .version("3.1.0") .booleanConf @@ -2058,7 +2058,7 @@ object SQLConf { buildConf("spark.sql.decimalOperations.allowPrecisionLoss") .internal() .doc("When true (default), establishing the result type of an arithmetic operation " + - "happens according to Hive behavior and SQL ANSI 2011 specification, ie. rounding the " + + "happens according to Hive behavior and SQL ANSI 2011 specification, i.e. rounding the " + "decimal part of the result if an exact representation is not possible. Otherwise, NULL " + "is returned in those cases, as previously.") .version("2.3.1") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 9fa27c7df3832..4badcbaa89aa4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -204,7 +204,7 @@ object RandomDataGenerator { specialDates.map(java.sql.Date.valueOf)) } case TimestampType => - def uniformMicorsRand(rand: Random): Long = { + def uniformMicrosRand(rand: Random): Long = { var milliseconds = rand.nextLong() % 253402329599999L // -62135740800000L is the number of milliseconds before January 1, 1970, 00:00:00 GMT // for "0001-01-01 00:00:00.000000". We need to find a @@ -225,7 +225,7 @@ object RandomDataGenerator { if (SQLConf.get.getConf(SQLConf.DATETIME_JAVA8API_ENABLED)) { randomNumeric[Instant]( rand, - (rand: Random) => DateTimeUtils.microsToInstant(uniformMicorsRand(rand)), + (rand: Random) => DateTimeUtils.microsToInstant(uniformMicrosRand(rand)), specialTs.map { s => val ldt = LocalDateTime.parse(s.replace(" ", "T")) ldt.atZone(ZoneId.systemDefault()).toInstant @@ -235,7 +235,7 @@ object RandomDataGenerator { rand, (rand: Random) => { // DateTimeUtils.toJavaTimestamp takes microsecond. - val ts = DateTimeUtils.toJavaTimestamp(uniformMicorsRand(rand)) + val ts = DateTimeUtils.toJavaTimestamp(uniformMicrosRand(rand)) // The generated `ts` is based on the hybrid calendar Julian + Gregorian since // 1582-10-15 but it should be valid in Proleptic Gregorian calendar too which is used // by Spark SQL since version 3.0 (see SPARK-26651). 
We try to convert `ts` to diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index f5bfdc5e695e0..61186c178b083 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -427,7 +427,7 @@ class AnalysisSuite extends AnalysisTest with Matchers { checkAnalysis(plan, expected) } - test("SPARK-12102: Ignore nullablity when comparing two sides of case") { + test("SPARK-12102: Ignore nullability when comparing two sides of case") { val relation = LocalRelation(Symbol("a").struct(Symbol("x").int), Symbol("b").struct(Symbol("x").int.withNullability(false))) val plan = relation.select( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala index 249e7a49a0a90..cdfae14138290 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala @@ -160,7 +160,7 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest { } test("grouping function") { - // GrouingSets + // GroupingSets val originalPlan = GroupingSets(Seq(Seq(), Seq(unresolved_a), Seq(unresolved_a, unresolved_b)), Seq(unresolved_a, unresolved_b), r1, Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c)), @@ -200,7 +200,7 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest { } test("grouping_id") { - // GrouingSets + // GroupingSets val originalPlan = GroupingSets(Seq(Seq(), Seq(unresolved_a), Seq(unresolved_a, unresolved_b)), Seq(unresolved_a, unresolved_b), r1, Seq(unresolved_a, unresolved_b, UnresolvedAlias(count(unresolved_c)), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 1e5bc271ab270..5c4d45b5394f7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -260,7 +260,7 @@ class TypeCoercionSuite extends AnalysisTest { // Tests that its not possible to setup implicit casts between two map types when // source map's key type is integer and the target map's key type are either Binary, - // Boolean, Date, Timestamp, Array, Struct, CaleandarIntervalType or NullType + // Boolean, Date, Timestamp, Array, Struct, CalendarIntervalType or NullType nonCastableTargetTypes.foreach { targetType => shouldNotCast(sourceType, targetType) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala index cdc3f4275414c..fa779477cccab 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala @@ -887,7 +887,7 @@ class UnsupportedOperationsSuite extends SparkFunSuite with SQLHelper { } } - /** Assert that the logical plan is supported 
for continuous procsssing mode */ + /** Assert that the logical plan is supported for continuous processing mode */ def assertSupportedForContinuousProcessing( name: String, plan: LogicalPlan, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index adaabfe4d32bb..bca8c56a1071e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -527,7 +527,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { .exists(_.getRenderedMessage().contains("Generated method too long"))) } - test("SPARK-28916: subexrepssion elimination can cause 64kb code limit on UnsafeProjection") { + test("SPARK-28916: subexpression elimination can cause 64kb code limit on UnsafeProjection") { val numOfExprs = 10000 val exprs = (0 to numOfExprs).flatMap(colIndex => Seq(Add(BoundReference(colIndex, DoubleType, true), @@ -554,7 +554,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { // Expecting result: // "((scala.math.LowPriorityOrderingImplicits$$anon$3) references[0] /* comparator */)" - // Using lenient assertions to be resilient to annonymous class numbering changes + // Using lenient assertions to be resilient to anonymous class numbering changes assert(!refTerm.contains("null")) assert(refTerm.contains("scala.math.LowPriorityOrderingImplicits$$anon$")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index 3d6f6937e780b..57abdb4de229f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -425,14 +425,14 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { def checkErrorMessage( childDataType: DataType, fieldDataType: DataType, - errorMesage: String): Unit = { + errorMessage: String): Unit = { val e = intercept[org.apache.spark.sql.AnalysisException] { ExtractValue( Literal.create(null, childDataType), Literal.create(null, fieldDataType), _ == _) } - assert(e.getMessage().contains(errorMesage)) + assert(e.getMessage().contains(errorMessage)) } checkErrorMessage(structType, IntegerType, "Field name should be String Literal") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala index 87e34aca510f5..ee6f89a155ae0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala @@ -212,8 +212,8 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper test("case key when - internal pattern matching expects a List while apply takes a Seq") { val indexedSeq = IndexedSeq(Literal(1), Literal(42), Literal(42), Literal(1)) - val caseKeyWhaen = CaseKeyWhen(Literal(12), indexedSeq) - assert(caseKeyWhaen.branches == + val caseKeyWhen = CaseKeyWhen(Literal(12), indexedSeq) + 
assert(caseKeyWhen.branches == IndexedSeq((Literal(12) === Literal(1), Literal(42)), (Literal(12) === Literal(42), Literal(1)))) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 70eb391ad6e05..26d98157807cd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -451,7 +451,7 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB if (interpret.isDefined && codegen.isDefined && !compareResults(interpret.get, codegen.get)) { fail(s"Incorrect evaluation: $expr, interpret: ${interpret.get}, codegen: ${codegen.get}") } else if (interpretExc.isDefined && codegenExc.isEmpty) { - fail(s"Incorrect evaluation: $expr, interpet threw exception ${interpretExc.get}") + fail(s"Incorrect evaluation: $expr, interpret threw exception ${interpretExc.get}") } else if (interpretExc.isEmpty && codegenExc.isDefined) { fail(s"Incorrect evaluation: $expr, codegen threw exception ${codegenExc.get}") } else if (interpretExc.isDefined && codegenExc.isDefined diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index bc2b93e5390da..d425d0ba42186 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -212,9 +212,9 @@ class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val initializeWithNonexistingMethod = InitializeJavaBean( Literal.fromObject(new java.util.LinkedList[Int]), - Map("nonexisting" -> Literal(1))) + Map("nonexistent" -> Literal(1))) checkExceptionInExpression[Exception](initializeWithNonexistingMethod, - """A method named "nonexisting" is not declared in any enclosing class """ + + """A method named "nonexistent" is not declared in any enclosing class """ + "nor any supertype") val initializeWithWrongParamType = InitializeJavaBean( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 730574a4b9846..78e9cf82a28b1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -118,7 +118,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { testElt(null, 1, null, "world") testElt(null, null, "hello", "world") - // Invalid ranages + // Invalid ranges testElt(null, 3, "hello", "world") testElt(null, 0, "hello", "world") testElt(null, -1, "hello", "world") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala index 972db7fa30a91..d6e6142b07a3f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala @@ -66,20 +66,20 @@ class PercentileSuite extends SparkFunSuite { // Test with row with frequency. Second and third columns are frequency in Int and Long val countForFrequencyTest = 1000 val rowsWithFrequency = (1 to countForFrequencyTest).map(x => Seq(x, x):+ x.toLong) - val expectedPercentilesWithFrquency = Seq(1.0, 500.0, 707.0, 866.0, 1000.0) + val expectedPercentilesWithFrequency = Seq(1.0, 500.0, 707.0, 866.0, 1000.0) val frequencyExpressionInt = BoundReference(1, IntegerType, nullable = false) val aggInt = new Percentile(childExpression, percentageExpression, frequencyExpressionInt) - runTest(aggInt, rowsWithFrequency, expectedPercentilesWithFrquency) + runTest(aggInt, rowsWithFrequency, expectedPercentilesWithFrequency) val frequencyExpressionLong = BoundReference(2, LongType, nullable = false) val aggLong = new Percentile(childExpression, percentageExpression, frequencyExpressionLong) - runTest(aggLong, rowsWithFrequency, expectedPercentilesWithFrquency) + runTest(aggLong, rowsWithFrequency, expectedPercentilesWithFrequency) // Run test with Flatten data val flattenRows = (1 to countForFrequencyTest).flatMap(current => (1 to current).map(y => current )).map(Seq(_)) - runTest(agg, flattenRows, expectedPercentilesWithFrquency) + runTest(agg, flattenRows, expectedPercentilesWithFrequency) } private def runTest(agg: Percentile, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala index d660afb7f8a05..9d4c5986300c5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala @@ -115,7 +115,7 @@ class CodeBlockSuite extends SparkFunSuite { assert(exprValues === Set(isNull1, value1, isNull2, value2, literal)) } - test("Throws exception when interpolating unexcepted object in code block") { + test("Throws exception when interpolating unexpected object in code block") { val obj = Tuple2(1, 1) val e = intercept[IllegalArgumentException] { code"$obj" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationSuite.scala index 2eea840e21a31..8543b62fd8bdd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SetOperationSuite.scala @@ -154,11 +154,11 @@ class SetOperationSuite extends PlanTest { .union(testRelation2.select(Literal(-1L).as("vcol"), 'd, 'e, 'f)) .groupBy('a, 'b, 'c)('a, 'b, 'c, sum('vcol).as("sum")) .where(GreaterThan('sum, Literal(0L))).analyze - val multiplerAttr = planFragment.output.last + val multiplierAttr = planFragment.output.last val output = planFragment.output.dropRight(1) val expectedPlan = Project(output, Generate( - ReplicateRows(Seq(multiplerAttr) ++ output), + ReplicateRows(Seq(multiplierAttr) ++ output), Nil, false, None, @@ -183,11 +183,11 @@ class SetOperationSuite extends PlanTest { .select('a, 'b, 'c, If(GreaterThan('vcol1_count, 'vcol2_count), 'vcol2_count, 'vcol1_count).as("min_count")) .analyze - val multiplerAttr = planFragment.output.last + val multiplierAttr = planFragment.output.last val 
output = planFragment.output.dropRight(1) val expectedPlan = Project(output, Generate( - ReplicateRows(Seq(multiplerAttr) ++ output), + ReplicateRows(Seq(multiplierAttr) ++ output), Nil, false, None, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index 0f1b4a3ea918c..e98ec6a667a73 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -958,7 +958,7 @@ class DDLParserSuite extends AnalysisTest { Some(first()))) } - test("alter table: mutiple property changes are not allowed") { + test("alter table: multiple property changes are not allowed") { intercept[ParseException] { parsePlan("ALTER TABLE table_name ALTER COLUMN a.b.c " + "TYPE bigint COMMENT 'new comment'")} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala index b9f984001523a..46ad5d1dec7e4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala @@ -124,8 +124,8 @@ class DataTypeParserSuite extends SparkFunSuite { unsupported("struct") test("Do not print empty parentheses for no params") { - assert(intercept("unkwon").getMessage.contains("unkwon is not supported")) - assert(intercept("unkwon(1,2,3)").getMessage.contains("unkwon(1,2,3) is not supported")) + assert(intercept("unknown").getMessage.contains("unknown is not supported")) + assert(intercept("unknown(1,2,3)").getMessage.contains("unknown(1,2,3) is not supported")) } // DataType parser accepts certain reserved keywords. 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala index 00b6828c08b38..99051d692451b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala @@ -77,7 +77,7 @@ class ErrorParserSuite extends AnalysisTest { } test("SPARK-21136: misleading error message due to problematic antlr grammar") { - intercept("select * from a left joinn b on a.id = b.id", "missing 'JOIN' at 'joinn'") + intercept("select * from a left join_ b on a.id = b.id", "missing 'JOIN' at 'join_'") intercept("select * from test where test.t is like 'test'", "mismatched input 'is' expecting") intercept("SELECT * FROM test WHERE x NOT NULL", "mismatched input 'NOT' expecting") } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala index 9f6a76b9228c5..0b304a799cdc5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala @@ -590,7 +590,7 @@ class ExpressionParserSuite extends AnalysisTest { // tests that have different result regarding the conf if (escape) { - // When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing fallbacks to + // When SQLConf.ESCAPED_STRING_LITERALS is enabled, string literal parsing falls back to // Spark 1.6 behavior. // 'LIKE' string literals. @@ -780,7 +780,7 @@ class ExpressionParserSuite extends AnalysisTest { val complexName = FunctionIdentifier("`ba`r", Some("`fo`o")) assertEqual(complexName.quotedString, UnresolvedAttribute("`fo`o.`ba`r")) intercept(complexName.unquotedString, "mismatched input") - // Function identifier contains countious backticks should be treated correctly. + // Function identifier contains continuous backticks should be treated correctly. val complexName2 = FunctionIdentifier("ba``r", Some("fo``o")) assertEqual(complexName2.quotedString, UnresolvedAttribute("fo``o.ba``r")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala index f037ce7b9e793..bad3e0d79dd12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala @@ -355,7 +355,7 @@ class TableIdentifierParserSuite extends SparkFunSuite with SQLKeywordUtils { assert(complexName === parseTableIdentifier("```d``b``1`.```weird``table``name`")) assert(complexName === parseTableIdentifier(complexName.quotedString)) intercept[ParseException](parseTableIdentifier(complexName.unquotedString)) - // Table identifier contains countious backticks should be treated correctly. + // Table identifier contains continuous backticks should be treated correctly. 
val complexName2 = TableIdentifier("x``y", Some("d``b")) assert(complexName2 === parseTableIdentifier(complexName2.quotedString)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala index 2e190c6ba6d4b..5729b02dc4926 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/UnsafeArraySuite.scala @@ -55,7 +55,7 @@ class UnsafeArraySuite extends SparkFunSuite { BigDecimal("1.2345678901234567890123456").setScale(21, BigDecimal.RoundingMode.FLOOR), BigDecimal("2.3456789012345678901234567").setScale(21, BigDecimal.RoundingMode.FLOOR)) - val calenderintervalArray = Array( + val calendarintervalArray = Array( new CalendarInterval(3, 2, 321), new CalendarInterval(1, 2, 123)) val intMultiDimArray = Array(Array(1), Array(2, 20), Array(3, 30, 300)) @@ -142,12 +142,12 @@ class UnsafeArraySuite extends SparkFunSuite { val schema = new StructType().add("array", ArrayType(CalendarIntervalType)) val encoder = RowEncoder(schema).resolveAndBind() - val externalRow = Row(calenderintervalArray) + val externalRow = Row(calendarintervalArray) val ir = encoder.createSerializer().apply(externalRow) val unsafeCalendar = ir.getArray(0) assert(unsafeCalendar.isInstanceOf[UnsafeArrayData]) - assert(unsafeCalendar.numElements == calenderintervalArray.length) - calenderintervalArray.zipWithIndex.map { case (e, i) => + assert(unsafeCalendar.numElements == calendarintervalArray.length) + calendarintervalArray.zipWithIndex.map { case (e, i) => assert(unsafeCalendar.getInterval(i) == e) } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java index d8e61a87e7f62..b2ef1c7722ef8 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java @@ -49,7 +49,7 @@ public ColumnDescriptor(TColumnDesc tColumnDesc) { public static ColumnDescriptor newPrimitiveColumnDescriptor(String name, String comment, Type type, int position) { // Current usage looks like it's only for metadata columns, but if that changes then - // this method may need to require a type qualifiers aruments. + // this method may need to require a type qualifiers arguments. 
return new ColumnDescriptor(name, comment, new TypeDescriptor(type), position); } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoValue.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoValue.java index 2b2359cc13c0f..bf3c6b27ea81d 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoValue.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/GetInfoValue.java @@ -52,7 +52,7 @@ public GetInfoValue(TGetInfoValue tGetInfoValue) { stringValue = tGetInfoValue.getStringValue(); break; default: - throw new IllegalArgumentException("Unreconigzed TGetInfoValue"); + throw new IllegalArgumentException("Unrecognized TGetInfoValue"); } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java index c25c742d392b3..59630672847e4 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java @@ -103,7 +103,7 @@ public class GetColumnsOperation extends MetadataOperation { "Schema of table that is the scope of a reference attribute " + "(null if the DATA_TYPE isn't REF)") .addPrimitiveColumn("SCOPE_TABLE", Type.STRING_TYPE, - "Table name that this the scope of a reference attribure " + "Table name that this the scope of a reference attribute " + "(null if the DATA_TYPE isn't REF)") .addPrimitiveColumn("SOURCE_DATA_TYPE", Type.SMALLINT_TYPE, "Source type of a distinct type or user-generated Ref type, " diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java index 1b3e8fe6bfb9d..f47a4388f7bea 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java @@ -175,9 +175,9 @@ protected BufferedReader loadFile(String fileName) throws IOException { @Override protected int processCmd(String cmd) { int rc = 0; - String cmd_trimed = cmd.trim(); + String cmd_trimmed = cmd.trim(); try { - executeStatementInternal(cmd_trimed, null, false, 0); + executeStatementInternal(cmd_trimmed, null, false, 0); } catch (HiveSQLException e) { rc = -1; LOG.warn("Failed to execute HQL command in global .hiverc file.", e); diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java index ab9ed5b1f371e..13fc552a9a42e 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java @@ -137,7 +137,7 @@ protected void initializeServer() { httpServer.setHandler(context); context.addServlet(new ServletHolder(thriftHttpServlet), httpPath); - // TODO: check defaults: maxTimeout, keepalive, maxBodySize, bodyRecieveDuration, etc. + // TODO: check defaults: maxTimeout, keepalive, maxBodySize, bodyReceiveDuration, etc. 
// Finally, start the server httpServer.start(); // In case HIVE_SERVER2_THRIFT_HTTP_PORT or hive.server2.thrift.http.port is configured with diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala index 4564c2209a931..820859b65925b 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala @@ -17,7 +17,7 @@ /** * These classes in this package are intentionally placed to the outer package of spark, - * because IsolatedClientLoader leverages Spark classloader for shared classess including + * because IsolatedClientLoader leverages Spark classloader for shared classes including * spark package, and the test should fail if Spark initializes these listeners with * IsolatedClientLoader. */ diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala index f28faea2be868..f2bb337e4a826 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala @@ -42,7 +42,7 @@ class SparkSQLEnvSuite extends SparkFunSuite { QUERY_EXECUTION_LISTENERS.key -> classOf[DummyQueryExecutionListener].getCanonicalName, STREAMING_QUERY_LISTENERS.key -> classOf[DummyStreamingQueryListener].getCanonicalName, WAREHOUSE_PATH.key -> TestHiveContext.makeWarehouseDir().toURI.getPath, - // The issue occured from "maven" and list of custom jars, but providing list of custom + // The issue occurred from "maven" and list of custom jars, but providing list of custom // jars to initialize HiveClient isn't trivial, so just use "maven". HIVE_METASTORE_JARS.key -> "maven", HIVE_METASTORE_VERSION.key -> null, diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 4ce1964a19bd9..c263932c2f535 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -56,7 +56,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) // Ensures that cross joins are enabled so that we can test them TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true) - // Ensures that the table insertion behaivor is consistent with Hive + // Ensures that the table insertion behavior is consistent with Hive TestHive.setConf(SQLConf.STORE_ASSIGNMENT_POLICY, StoreAssignmentPolicy.LEGACY.toString) // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) @@ -305,7 +305,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { // Unsupported underscore syntax. "inputddl5", - // Thift is broken... + // Thrift is broken... 
"inputddl8", // Hive changed ordering of ddl: @@ -496,7 +496,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "drop_partitions_filter2", "drop_partitions_filter3", - // The following failes due to truncate table + // The following fails due to truncate table "truncate_table", // We do not support DFS command. @@ -716,7 +716,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "groupby_multi_insert_common_distinct", "groupby_multi_single_reducer2", "groupby_multi_single_reducer3", - "groupby_mutli_insert_common_distinct", + "groupby_multi_insert_common_distinct", "groupby_neg_float", "groupby_ppd", "groupby_ppr", @@ -958,8 +958,8 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { "subq2", "subquery_exists", "subquery_exists_having", - "subquery_notexists", - "subquery_notexists_having", + "subquery_nonexistent", + "subquery_nonexistent_having", "subquery_in_having", "tablename_with_select", "timestamp_comparison", diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index a89243c331c7b..e02589e5cad00 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -332,7 +332,7 @@ private[hive] object HiveMetastoreCatalog { metastoreSchema: StructType, inferredSchema: StructType): StructType = try { // scalastyle:off caselocale - // Find any nullable fields in mestastore schema that are missing from the inferred schema. + // Find any nullable fields in metastore schema that are missing from the inferred schema. val metastoreFields = metastoreSchema.map(f => f.name.toLowerCase -> f).toMap val missingNullables = metastoreFields .filterKeys(!inferredSchema.map(_.name.toLowerCase).contains(_)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 34befb8a6f965..b4ebf153fc178 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -367,14 +367,14 @@ private[hive] class HiveClientImpl( override def getDatabase(dbName: String): CatalogDatabase = withHiveState { Option(client.getDatabase(dbName)).map { d => - val paras = Option(d.getParameters).map(_.asScala.toMap).getOrElse(Map()) ++ + val params = Option(d.getParameters).map(_.asScala.toMap).getOrElse(Map()) ++ Map(PROP_OWNER -> shim.getDatabaseOwnerName(d)) CatalogDatabase( name = d.getName, description = Option(d.getDescription).getOrElse(""), locationUri = CatalogUtils.stringToURI(d.getLocationUri), - properties = paras) + properties = params) }.getOrElse(throw new NoSuchDatabaseException(dbName)) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala index 4096916a100c3..26baff3d83eec 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationExec.scala @@ -92,7 +92,7 @@ case class HiveScriptTransformationExec( scriptOutputWritable.readFields(scriptOutputStream) } catch { case _: EOFException => - // This 
means that the stdout of `proc` (ie. TRANSFORM process) has exhausted. + // This means that the stdout of `proc` (i.e. TRANSFORM process) has exhausted. // Ideally the proc should *not* be alive at this point but // there can be a lag between EOF being written out and the process // being terminated. So explicitly waiting for the process to be done. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 3c3f31ac2994a..63e46880376e1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -172,7 +172,7 @@ case class InsertIntoHiveTable( table.bucketSpec match { case Some(bucketSpec) => // Writes to bucketed hive tables are allowed only if user does not care about maintaining - // table's bucketing ie. both "hive.enforce.bucketing" and "hive.enforce.sorting" are + // table's bucketing i.e. both "hive.enforce.bucketing" and "hive.enforce.sorting" are // set to false val enforceBucketingConfig = "hive.enforce.bucketing" val enforceSortingConfig = "hive.enforce.sorting" diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala index cd07199e48ed7..3fa8449c3cb01 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.execution.datasources.DataSourceStrategy * the hive table relation will be updated based on pruned partitions. * * This rule is executed in optimization phase, so the statistics can be updated before physical - * planning, which is useful for some spark strategy, eg. + * planning, which is useful for some spark strategy, e.g. * [[org.apache.spark.sql.execution.SparkStrategies.JoinSelection]]. * * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source. 
diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q index 28bbc2d8f1a3e..df5334c785f6a 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q +++ b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/auto_sortmerge_join_13.q @@ -19,7 +19,7 @@ set hive.optimize.bucketmapjoin = true; set hive.optimize.bucketmapjoin.sortedmerge = true; set hive.auto.convert.join=true; --- A SMB join followed by a mutli-insert +-- A SMB join followed by a multi-insert explain from ( SELECT a.key key1, a.value value1, b.key key2, b.value value2 @@ -41,7 +41,7 @@ select * from dest2 order by k1, k2; set hive.auto.convert.join.noconditionaltask=true; set hive.auto.convert.join.noconditionaltask.size=200; --- A SMB join followed by a mutli-insert +-- A SMB join followed by a multi-insert explain from ( SELECT a.key key1, a.value value1, b.key key2, b.value value2 @@ -61,7 +61,7 @@ select * from dest1 order by k1, k2; select * from dest2 order by k1, k2; set hive.auto.convert.sortmerge.join.to.mapjoin=true; --- A SMB join followed by a mutli-insert +-- A SMB join followed by a multi-insert explain from ( SELECT a.key key1, a.value value1, b.key key2, b.value value2 diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q index 91e97de62c82f..843ba4a3dbacd 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q +++ b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/bucketsortoptimize_insert_3.q @@ -18,7 +18,7 @@ FROM src INSERT OVERWRITE TABLE test_table1 PARTITION (ds = '1') SELECT *; -- Insert data into the bucketed table by selecting from another bucketed table --- The bucketing positions dont match - although the actual bucketing do. +-- The bucketing positions don't match - although the actual bucketing do. 
-- This should be a map-only operation EXPLAIN INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') @@ -37,7 +37,7 @@ CREATE TABLE test_table3 (key INT, value STRING) PARTITIONED BY (ds STRING) CLUSTERED BY (value) SORTED BY (value) INTO 2 BUCKETS; -- Insert data into the bucketed table by selecting from another bucketed table --- The bucketing positions dont match - this should be a map-reduce operation +-- The bucketing positions don't match - this should be a map-reduce operation EXPLAIN INSERT OVERWRITE TABLE test_table2 PARTITION (ds = '1') SELECT x.key, x.value from diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/smb_mapjoin_20.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/smb_mapjoin_20.q index f70e7d5c86237..4c56cad2411fc 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/smb_mapjoin_20.q +++ b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/smb_mapjoin_20.q @@ -32,7 +32,7 @@ CREATE TABLE test_table3 (key STRING, value1 int, value2 string) PARTITIONED BY CLUSTERED BY (value1) SORTED BY (value1) INTO 2 BUCKETS; -- Insert data into the bucketed table by selecting from another bucketed table --- This should be a map-only operation, although the bucketing positions dont match +-- This should be a map-only operation, although the bucketing positions don't match EXPLAIN INSERT OVERWRITE TABLE test_table3 PARTITION (ds = '1') SELECT a.value, a.key, a.value FROM test_table1 a WHERE a.ds = '1'; diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index 71750e6b3a516..b715f484fa02a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.util.Utils case class TestData(key: Int, value: String) -case class ThreeCloumntable(key: Int, value: String, key1: String) +case class ThreeColumnTable(key: Int, value: String, key1: String) class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter with SQLTestUtils with PrivateMethodTester { @@ -764,7 +764,7 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val path = dir.toURI.getPath val e = intercept[AnalysisException] { - sql(s"INSERT OVERWRITE LOCAL DIRECTORY '${path}' TABLE notexists") + sql(s"INSERT OVERWRITE LOCAL DIRECTORY '${path}' TABLE nonexistent") }.getMessage assert(e.contains("Table or view not found")) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 2ea98943011f4..2e98a76c52488 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -735,7 +735,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } - test("analyze column command paramaters validation") { + test("analyze column command parameters validation") { val e1 = intercept[IllegalArgumentException] { AnalyzeColumnCommand(TableIdentifier("test"), Option(Seq("c1")), true).run(spark) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala index 1018ae5b68895..0876709c31899 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala @@ -372,7 +372,7 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T } } - test("SPARK-32400: TRANSFORM doesn't support CalenderIntervalType/UserDefinedType (hive serde)") { + test("SPARK-32400: TRANSFORM doesn't support CalendarIntervalType/UserDefinedType (hive serde)") { assume(TestUtils.testCommandAvailable("/bin/bash")) withTempView("v") { val df = Seq( @@ -410,7 +410,7 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T } test("SPARK-32400: TRANSFORM doesn't support" + - " CalenderIntervalType/UserDefinedType end to end (hive serde)") { + " CalendarIntervalType/UserDefinedType end to end (hive serde)") { assume(TestUtils.testCommandAvailable("/bin/bash")) withTempView("v") { val df = Seq( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 6b82b1267bc66..3370695245fd0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -228,7 +228,7 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi checkAnswer(sql(s"SHOW functions $db.temp_abs"), Row("temp_abs")) checkAnswer(sql(s"SHOW functions `$db`.`temp_abs`"), Row("temp_abs")) checkAnswer(sql(s"SHOW functions `$db`.`temp_abs`"), Row("temp_abs")) - checkAnswer(sql("SHOW functions `a function doens't exist`"), Nil) + checkAnswer(sql("SHOW functions `a function doesn't exist`"), Nil) checkAnswer(sql("SHOW functions `temp_weekofyea*`"), Row("temp_weekofyear")) // this probably will failed if we add more function with `sha` prefixing. @@ -768,7 +768,7 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi sql("SELECT * FROM nested").collect().toSeq) intercept[AnalysisException] { - sql("CREATE TABLE test_ctas_1234 AS SELECT * from notexists").collect() + sql("CREATE TABLE test_ctas_1234 AS SELECT * from nonexistent").collect() } } } @@ -1739,12 +1739,12 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi |SELECT 'blarr' """.stripMargin) - // project list is the same order of paritioning columns in table definition + // project list is the same order of partitioning columns in table definition checkAnswer( sql(s"SELECT p1, p2, p3, p4, p5, c1 FROM $table"), Row("a", "b", "c", "d", "e", "blarr") :: Nil) - // project list does not have the same order of paritioning columns in table definition + // project list does not have the same order of partitioning columns in table definition checkAnswer( sql(s"SELECT p2, p3, p4, p1, p5, c1 FROM $table"), Row("b", "c", "d", "a", "e", "blarr") :: Nil) From 6aff215077e2cdf9cec187c827da63c067514e4e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 7 Dec 2020 10:50:31 -0800 Subject: [PATCH 132/150] [SPARK-33693][SQL] deprecate spark.sql.hive.convertCTAS ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/30554 . Now we have a new config for converting CREATE TABLE, we don't need the old config that only works for CTAS. ### Why are the changes needed? It's confusing for having two config while one can cover another completely. 
### Does this PR introduce _any_ user-facing change? no, it's deprecating not removing. ### How was this patch tested? N/A Closes #30651 from cloud-fan/minor. Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun --- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e8e1120cbb884..bc62213bdb740 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -3005,7 +3005,9 @@ object SQLConf { s"Use '${ADVISORY_PARTITION_SIZE_IN_BYTES.key}' instead of it."), DeprecatedConfig(OPTIMIZER_METADATA_ONLY.key, "3.0", "Avoid to depend on this optimization to prevent a potential correctness issue. " + - "If you must use, use 'SparkSessionExtensions' instead to inject it as a custom rule.") + "If you must use, use 'SparkSessionExtensions' instead to inject it as a custom rule."), + DeprecatedConfig(CONVERT_CTAS.key, "3.1", + s"Set '${LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT.key}' to false instead.") ) Map(configs.map { cfg => cfg.key -> cfg } : _*) From c0874ba9f13b9802eef4418490020692e37652ba Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 7 Dec 2020 13:35:37 -0800 Subject: [PATCH 133/150] [SPARK-33480][SQL][FOLLOWUP] do not expose user data in error message ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/30412. This PR updates the error message of char/varchar table insertion length check, to not expose user data. ### Why are the changes needed? This is risky to expose user data in the error message, especially the string data, as it may contain sensitive data. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? updated tests Closes #30653 from cloud-fan/minor2. 
Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun --- .../sql/catalyst/util/CharVarcharUtils.scala | 6 ++--- .../spark/sql/CharVarcharTestSuite.scala | 26 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala index b551d9699f360..e42e384e4b86b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala @@ -183,9 +183,9 @@ object CharVarcharUtils extends Logging { private def raiseError(expr: Expression, typeName: String, length: Int): Expression = { val errorMsg = Concat(Seq( - Literal("input string '"), - expr, - Literal(s"' exceeds $typeName type length limitation: $length"))) + Literal("input string of length "), + Cast(Length(expr), StringType), + Literal(s" exceeds $typeName type length limitation: $length"))) Cast(RaiseError(errorMsg), StringType) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala index fcd334be7a6f7..b0f1198e46440 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala @@ -190,7 +190,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(null)) val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -203,7 +203,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(1, null)) val e = intercept[SparkException](sql("INSERT INTO t VALUES (1, '123456')")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } } @@ -215,7 +215,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(Row(null))) val e = intercept[SparkException](sql("INSERT INTO t SELECT struct('123456')")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -226,7 +226,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(Seq(null))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (array('a', '123456'))")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -235,7 +235,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { sql(s"CREATE TABLE t(c MAP<$typeName(5), STRING>) USING $format") val e = intercept[SparkException](sql("INSERT INTO t VALUES (map('123456', 'a'))")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -246,7 +246,7 @@ trait CharVarcharTestSuite extends 
QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(Map("a" -> null))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (map('a', '123456'))")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -255,10 +255,10 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { sql(s"CREATE TABLE t(c MAP<$typeName(5), $typeName(5)>) USING $format") val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (map('123456', 'a'))")) assert(e1.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (map('a', '123456'))")) assert(e2.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -269,7 +269,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(Row(Seq(null)))) val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -280,7 +280,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(Seq(Row(null)))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -291,7 +291,7 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row(Seq(Seq(null)))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))")) assert(e.getCause.getMessage.contains( - s"input string '123456' exceeds $typeName type length limitation: 5")) + s"input string of length 6 exceeds $typeName type length limitation: 5")) } } @@ -313,10 +313,10 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row("1234 ", "1234")) val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)")) assert(e1.getCause.getMessage.contains( - "input string '123456' exceeds char type length limitation: 5")) + "input string of length 6 exceeds char type length limitation: 5")) val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)")) assert(e2.getCause.getMessage.contains( - "input string '123456' exceeds varchar type length limitation: 5")) + "input string of length 6 exceeds varchar type length limitation: 5")) } } From 02508b68ecc56658a13d89bf798c5ef824ba2cdc Mon Sep 17 00:00:00 2001 From: Anton Okolnychyi Date: Mon, 7 Dec 2020 15:32:10 -0800 Subject: [PATCH 134/150] [SPARK-33621][SQL] Add a way to inject data source rewrite rules ### What changes were proposed in this pull request? This PR adds a way to inject data source rewrite rules. ### Why are the changes needed? Right now `SparkSessionExtensions` allow us to inject optimization rules but they are added to operator optimization batch. There are cases when users need to run rules after the operator optimization batch (e.g. 
cases when a rule relies on the fact that expressions have been optimized). Currently, this is not possible. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? This PR comes with a new test. Closes #30577 from aokolnychyi/spark-33621-v3. Authored-by: Anton Okolnychyi Signed-off-by: Dongjoon Hyun --- .../spark/sql/SparkSessionExtensions.scala | 16 ++++++++++++++++ .../sql/internal/BaseSessionStateBuilder.scala | 4 +++- .../spark/sql/SparkSessionExtensionSuite.scala | 6 ++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala index 6952f4bfd0566..d5d969032a5e1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSessionExtensions.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.{ColumnarRule, SparkPlan} *

 * <ul>
 * <li>Analyzer Rules.</li>
 * <li>Check Analysis Rules.</li>
 * <li>Optimizer Rules.</li>
+ * <li>Data Source Rewrite Rules.</li>
 * <li>Planning Strategies.</li>
 * <li>Customized Parser.</li>
 * <li>(External) Catalog listeners.</li>
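A minimal sketch of how an application could plug into the new extension point, relying only on the `withExtensions` builder API and the `injectDataSourceRewriteRule` method introduced in the next hunk; `NoopRewrite` is a made-up rule used purely for illustration:

```
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

// Made-up rule for illustration only; a real rule would rewrite data source plans.
case class NoopRewrite(spark: SparkSession) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan
}

// The builder is a SparkSession => Rule[LogicalPlan]; the injected rule runs after the
// operator optimization batch and before rules that depend on stats.
val spark = SparkSession.builder()
  .master("local[*]")
  .withExtensions(_.injectDataSourceRewriteRule(NoopRewrite))
  .getOrCreate()
```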
    • @@ -199,6 +200,21 @@ class SparkSessionExtensions { optimizerRules += builder } + private[this] val dataSourceRewriteRules = mutable.Buffer.empty[RuleBuilder] + + private[sql] def buildDataSourceRewriteRules(session: SparkSession): Seq[Rule[LogicalPlan]] = { + dataSourceRewriteRules.map(_.apply(session)).toSeq + } + + /** + * Inject an optimizer `Rule` builder that rewrites data source plans into the [[SparkSession]]. + * The injected rules will be executed after the operator optimization batch and before rules + * that depend on stats. + */ + def injectDataSourceRewriteRule(builder: RuleBuilder): Unit = { + dataSourceRewriteRules += builder + } + private[this] val plannerStrategyBuilders = mutable.Buffer.empty[StrategyBuilder] private[sql] def buildPlannerStrategies(session: SparkSession): Seq[Strategy] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 8101f9e291b44..f51ee11091d02 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -273,7 +273,9 @@ abstract class BaseSessionStateBuilder( * * Note that this may NOT depend on the `optimizer` function. */ - protected def customDataSourceRewriteRules: Seq[Rule[LogicalPlan]] = Nil + protected def customDataSourceRewriteRules: Seq[Rule[LogicalPlan]] = { + extensions.buildDataSourceRewriteRules(session) + } /** * Planner that converts optimized logical plans to physical plans. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index ea276bcec0f78..576ad26505d27 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -88,6 +88,12 @@ class SparkSessionExtensionSuite extends SparkFunSuite { } } + test("SPARK-33621: inject data source rewrite rule") { + withSession(Seq(_.injectDataSourceRewriteRule(MyRule))) { session => + assert(session.sessionState.optimizer.dataSourceRewriteRules.contains(MyRule(session))) + } + } + test("inject spark planner strategy") { withSession(Seq(_.injectPlannerStrategy(MySparkStrategy))) { session => assert(session.sessionState.planner.strategies.contains(MySparkStrategy(session))) From e4d1c10760800563d2a30410b46e5b0cd2671c4d Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 8 Dec 2020 09:35:36 +0800 Subject: [PATCH 135/150] [SPARK-32320][PYSPARK] Remove mutable default arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is bad practice, and might lead to unexpected behaviour: https://florimond.dev/blog/articles/2018/08/python-mutable-defaults-are-the-source-of-all-evil/ ``` fokkodriesprongFan spark % grep -R "={}" python | grep def python/pyspark/resource/profile.py: def __init__(self, _java_resource_profile=None, _exec_req={}, _task_req={}): python/pyspark/sql/functions.py:def from_json(col, schema, options={}): python/pyspark/sql/functions.py:def to_json(col, options={}): python/pyspark/sql/functions.py:def schema_of_json(json, options={}): python/pyspark/sql/functions.py:def schema_of_csv(csv, options={}): python/pyspark/sql/functions.py:def to_csv(col, options={}): python/pyspark/sql/functions.py:def from_csv(col, schema, options={}): 
python/pyspark/sql/avro/functions.py:def from_avro(data, jsonFormatSchema, options={}): ``` ``` fokkodriesprongFan spark % grep -R "=\[\]" python | grep def python/pyspark/ml/tuning.py: def __init__(self, bestModel, avgMetrics=[], subModels=None): python/pyspark/ml/tuning.py: def __init__(self, bestModel, validationMetrics=[], subModels=None): ``` ### What changes were proposed in this pull request? Removing the mutable default arguments. ### Why are the changes needed? Removing the mutable default arguments, and changing the signature to `Optional[...]`. ### Does this PR introduce _any_ user-facing change? No 👍 ### How was this patch tested? Using the Flake8 bugbear code analysis plugin. Closes #29122 from Fokko/SPARK-32320. Authored-by: Fokko Driesprong Signed-off-by: Ruifeng Zheng --- dev/sparktestsupport/modules.py | 9 +++++---- dev/tox.ini | 2 +- python/mypy.ini | 2 ++ python/pyspark/ml/regression.py | 4 ++-- python/pyspark/ml/tuning.py | 8 ++++---- python/pyspark/ml/tuning.pyi | 4 ++-- python/pyspark/resource/profile.py | 6 +++--- python/pyspark/resource/profile.pyi | 6 +++--- python/pyspark/sql/avro/functions.py | 4 ++-- python/pyspark/sql/avro/functions.pyi | 4 ++-- python/pyspark/sql/functions.py | 18 ++++++++++-------- python/pyspark/sql/functions.pyi | 12 ++++++------ 12 files changed, 42 insertions(+), 37 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 5d8b714711774..87bfbdf64a49f 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -31,9 +31,10 @@ class Module(object): files have changed. """ - def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, - sbt_test_goals=(), python_test_goals=(), excluded_python_implementations=(), - test_tags=(), should_run_r_tests=False, should_run_build_tests=False): + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), + environ=None, sbt_test_goals=(), python_test_goals=(), + excluded_python_implementations=(), test_tags=(), should_run_r_tests=False, + should_run_build_tests=False): """ Define a new module. 
@@ -62,7 +63,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.source_file_prefixes = source_file_regexes self.sbt_test_goals = sbt_test_goals self.build_profile_flags = build_profile_flags - self.environ = environ + self.environ = environ or {} self.python_test_goals = python_test_goals self.excluded_python_implementations = excluded_python_implementations self.test_tags = test_tags diff --git a/dev/tox.ini b/dev/tox.ini index 7edf7d597fb58..43cd5877dfdb8 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -19,6 +19,6 @@ max-line-length=100 exclude=python/pyspark/cloudpickle/*.py,shared.py,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* [flake8] -select = E901,E999,F821,F822,F823,F401,F405 +select = E901,E999,F821,F822,F823,F401,F405,B006 exclude = python/pyspark/cloudpickle/*.py,shared.py*,python/docs/source/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/*,python/out,python/pyspark/sql/pandas/functions.pyi,python/pyspark/sql/column.pyi,python/pyspark/worker.pyi,python/pyspark/java_gateway.pyi max-line-length = 100 diff --git a/python/mypy.ini b/python/mypy.ini index 5103452a053be..ad4fcf7f317f0 100644 --- a/python/mypy.ini +++ b/python/mypy.ini @@ -102,6 +102,8 @@ disallow_untyped_defs = False ; Ignore errors in embedded third party code +no_implicit_optional = True + [mypy-pyspark.cloudpickle.*] ignore_errors = True diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index d37654a7388f5..8ecb68458ffbc 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1801,7 +1801,7 @@ class AFTSurvivalRegression(_JavaRegressor, _AFTSurvivalRegressionParams, @keyword_only def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", - quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), + quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), # noqa: B005 quantilesCol=None, aggregationDepth=2, maxBlockSizeInMB=0.0): """ __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ @@ -1819,7 +1819,7 @@ def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="p @since("1.6.0") def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", - quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), + quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), # noqa: B005 quantilesCol=None, aggregationDepth=2, maxBlockSizeInMB=0.0): """ setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 2c083182de470..2bddfe822f29e 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -835,13 +835,13 @@ class CrossValidatorModel(Model, _CrossValidatorParams, MLReadable, MLWritable): .. versionadded:: 1.4.0 """ - def __init__(self, bestModel, avgMetrics=[], subModels=None): + def __init__(self, bestModel, avgMetrics=None, subModels=None): super(CrossValidatorModel, self).__init__() #: best model from cross validation self.bestModel = bestModel #: Average cross-validation metrics for each paramMap in #: CrossValidator.estimatorParamMaps, in the corresponding order. 
- self.avgMetrics = avgMetrics + self.avgMetrics = avgMetrics or [] #: sub model list from cross validation self.subModels = subModels @@ -1323,12 +1323,12 @@ class TrainValidationSplitModel(Model, _TrainValidationSplitParams, MLReadable, .. versionadded:: 2.0.0 """ - def __init__(self, bestModel, validationMetrics=[], subModels=None): + def __init__(self, bestModel, validationMetrics=None, subModels=None): super(TrainValidationSplitModel, self).__init__() #: best model from train validation split self.bestModel = bestModel #: evaluated validation metrics - self.validationMetrics = validationMetrics + self.validationMetrics = validationMetrics or [] #: sub models from train validation split self.subModels = subModels diff --git a/python/pyspark/ml/tuning.pyi b/python/pyspark/ml/tuning.pyi index e5f153d49e9c6..912abd4d7124a 100644 --- a/python/pyspark/ml/tuning.pyi +++ b/python/pyspark/ml/tuning.pyi @@ -104,7 +104,7 @@ class CrossValidatorModel( def __init__( self, bestModel: Model, - avgMetrics: List[float] = ..., + avgMetrics: Optional[List[float]] = ..., subModels: Optional[List[List[Model]]] = ..., ) -> None: ... def copy(self, extra: Optional[ParamMap] = ...) -> CrossValidatorModel: ... @@ -171,7 +171,7 @@ class TrainValidationSplitModel( def __init__( self, bestModel: Model, - validationMetrics: List[float] = ..., + validationMetrics: Optional[List[float]] = ..., subModels: Optional[List[Model]] = ..., ) -> None: ... def setEstimator(self, value: Estimator) -> TrainValidationSplitModel: ... diff --git a/python/pyspark/resource/profile.py b/python/pyspark/resource/profile.py index 1c59a1c4a123c..38a68bc74d97e 100644 --- a/python/pyspark/resource/profile.py +++ b/python/pyspark/resource/profile.py @@ -34,13 +34,13 @@ class ResourceProfile(object): This API is evolving. """ - def __init__(self, _java_resource_profile=None, _exec_req={}, _task_req={}): + def __init__(self, _java_resource_profile=None, _exec_req=None, _task_req=None): if _java_resource_profile is not None: self._java_resource_profile = _java_resource_profile else: self._java_resource_profile = None - self._executor_resource_requests = _exec_req - self._task_resource_requests = _task_req + self._executor_resource_requests = _exec_req or {} + self._task_resource_requests = _task_req or {} @property def id(self): diff --git a/python/pyspark/resource/profile.pyi b/python/pyspark/resource/profile.pyi index 04838692436df..c8f23a5cac370 100644 --- a/python/pyspark/resource/profile.pyi +++ b/python/pyspark/resource/profile.pyi @@ -22,7 +22,7 @@ from pyspark.resource.requests import ( # noqa: F401 TaskResourceRequest as TaskResourceRequest, TaskResourceRequests as TaskResourceRequests, ) -from typing import overload, Dict, Union +from typing import overload, Dict, Union, Optional from py4j.java_gateway import JavaObject # type: ignore[import] class ResourceProfile: @@ -35,8 +35,8 @@ class ResourceProfile: def __init__( self, _java_resource_profile: None = ..., - _exec_req: Dict[str, ExecutorResourceRequest] = ..., - _task_req: Dict[str, TaskResourceRequest] = ..., + _exec_req: Optional[Dict[str, ExecutorResourceRequest]] = ..., + _task_req: Optional[Dict[str, TaskResourceRequest]] = ..., ) -> None: ... @property def id(self) -> int: ... 
diff --git a/python/pyspark/sql/avro/functions.py b/python/pyspark/sql/avro/functions.py index ce322814e34f8..7e4ceb20cd2c4 100644 --- a/python/pyspark/sql/avro/functions.py +++ b/python/pyspark/sql/avro/functions.py @@ -25,7 +25,7 @@ from pyspark.util import _print_missing_jar -def from_avro(data, jsonFormatSchema, options={}): +def from_avro(data, jsonFormatSchema, options=None): """ Converts a binary column of Avro format into its corresponding catalyst value. The specified schema must match the read data, otherwise the behavior is undefined: @@ -70,7 +70,7 @@ def from_avro(data, jsonFormatSchema, options={}): sc = SparkContext._active_spark_context try: jc = sc._jvm.org.apache.spark.sql.avro.functions.from_avro( - _to_java_column(data), jsonFormatSchema, options) + _to_java_column(data), jsonFormatSchema, options or {}) except TypeError as e: if str(e) == "'JavaPackage' object is not callable": _print_missing_jar("Avro", "avro", "avro", sc.version) diff --git a/python/pyspark/sql/avro/functions.pyi b/python/pyspark/sql/avro/functions.pyi index 4c2e3814a9e94..49881335d8fcc 100644 --- a/python/pyspark/sql/avro/functions.pyi +++ b/python/pyspark/sql/avro/functions.pyi @@ -16,12 +16,12 @@ # specific language governing permissions and limitations # under the License. -from typing import Dict +from typing import Dict, Optional from pyspark.sql._typing import ColumnOrName from pyspark.sql.column import Column def from_avro( - data: ColumnOrName, jsonFormatSchema: str, options: Dict[str, str] = ... + data: ColumnOrName, jsonFormatSchema: str, options: Optional[Dict[str, str]] = ... ) -> Column: ... def to_avro(data: ColumnOrName, jsonFormatSchema: str = ...) -> Column: ... diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4dc3129fd6bc2..f612d2d0366f2 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -80,8 +80,10 @@ def _invoke_binary_math_function(name, col1, col2): ) -def _options_to_str(options): - return {key: to_str(value) for (key, value) in options.items()} +def _options_to_str(options=None): + if options: + return {key: to_str(value) for (key, value) in options.items()} + return {} def lit(col): @@ -3454,7 +3456,7 @@ def json_tuple(col, *fields): return Column(jc) -def from_json(col, schema, options={}): +def from_json(col, schema, options=None): """ Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType` as keys type, :class:`StructType` or :class:`ArrayType` with @@ -3510,7 +3512,7 @@ def from_json(col, schema, options={}): return Column(jc) -def to_json(col, options={}): +def to_json(col, options=None): """ Converts a column containing a :class:`StructType`, :class:`ArrayType` or a :class:`MapType` into a JSON string. Throws an exception, in the case of an unsupported type. @@ -3557,7 +3559,7 @@ def to_json(col, options={}): return Column(jc) -def schema_of_json(json, options={}): +def schema_of_json(json, options=None): """ Parses a JSON string and infers its schema in DDL format. @@ -3594,7 +3596,7 @@ def schema_of_json(json, options={}): return Column(jc) -def schema_of_csv(csv, options={}): +def schema_of_csv(csv, options=None): """ Parses a CSV string and infers its schema in DDL format. @@ -3627,7 +3629,7 @@ def schema_of_csv(csv, options={}): return Column(jc) -def to_csv(col, options={}): +def to_csv(col, options=None): """ Converts a column containing a :class:`StructType` into a CSV string. Throws an exception, in the case of an unsupported type. 
@@ -4038,7 +4040,7 @@ def sequence(start, stop, step=None): _to_java_column(start), _to_java_column(stop), _to_java_column(step))) -def from_csv(col, schema, options={}): +def from_csv(col, schema, options=None): """ Parses a column containing a CSV string to a row with the specified schema. Returns `null`, in the case of an unparseable string. diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index 50e178df9996f..acb17a2657d00 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -196,12 +196,12 @@ def json_tuple(col: ColumnOrName, *fields: str) -> Column: ... def from_json( col: ColumnOrName, schema: Union[ArrayType, StructType, Column, str], - options: Dict[str, str] = ..., + options: Optional[Dict[str, str]] = ..., ) -> Column: ... -def to_json(col: ColumnOrName, options: Dict[str, str] = ...) -> Column: ... -def schema_of_json(json: ColumnOrName, options: Dict[str, str] = ...) -> Column: ... -def schema_of_csv(csv: ColumnOrName, options: Dict[str, str] = ...) -> Column: ... -def to_csv(col: ColumnOrName, options: Dict[str, str] = ...) -> Column: ... +def to_json(col: ColumnOrName, options: Optional[Dict[str, str]] = ...) -> Column: ... +def schema_of_json(json: ColumnOrName, options: Optional[Dict[str, str]] = ...) -> Column: ... +def schema_of_csv(csv: ColumnOrName, options: Optional[Dict[str, str]] = ...) -> Column: ... +def to_csv(col: ColumnOrName, options: Optional[Dict[str, str]] = ...) -> Column: ... def size(col: ColumnOrName) -> Column: ... def array_min(col: ColumnOrName) -> Column: ... def array_max(col: ColumnOrName) -> Column: ... @@ -223,7 +223,7 @@ def sequence( def from_csv( col: ColumnOrName, schema: Union[StructType, Column, str], - options: Dict[str, str] = ..., + options: Optional[Dict[str, str]] = ..., ) -> Column: ... @overload def transform(col: ColumnOrName, f: Callable[[Column], Column]) -> Column: ... From b2a79306ef7b330c5bf4dc1337ed80ebd6e08d0c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 7 Dec 2020 18:59:15 -0800 Subject: [PATCH 136/150] [SPARK-33680][SQL][TESTS][FOLLOWUP] Fix more test suites to have explicit confs ### What changes were proposed in this pull request? This is a follow-up for SPARK-33680 to remove the assumption on the default value of `spark.sql.adaptive.enabled` . ### Why are the changes needed? According to the test result https://github.com/apache/spark/pull/30628#issuecomment-739866168, the [previous run](https://github.com/apache/spark/pull/30628#issuecomment-739641105) didn't run all tests. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. Closes #30655 from dongjoon-hyun/SPARK-33680. 
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../spark/sql/DataFrameAggregateSuite.scala | 4 +- .../apache/spark/sql/DataFrameJoinSuite.scala | 4 +- .../org/apache/spark/sql/JoinSuite.scala | 9 ++- .../spark/sql/execution/PlannerSuite.scala | 73 +++++++++++++------ .../spark/sql/sources/BucketedReadSuite.scala | 5 +- .../SqlResourceWithActualMetricsSuite.scala | 11 ++- 6 files changed, 74 insertions(+), 32 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index d4e64aa03df0e..78983a4bd1a29 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -1001,7 +1001,9 @@ class DataFrameAggregateSuite extends QueryTest Seq(true, false).foreach { value => test(s"SPARK-31620: agg with subquery (whole-stage-codegen = $value)") { - withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> value.toString) { + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> value.toString, + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2") { sql("create temporary view t1 as select * from values (1, 2) as t1(a, b)") sql("create temporary view t2 as select * from values (3, 4) as t2(c, d)") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 14d03a30453ac..c317f562c65dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -335,7 +335,9 @@ class DataFrameJoinSuite extends QueryTest withTempDatabase { dbName => withTable(table1Name, table2Name) { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { spark.range(50).write.saveAsTable(s"$dbName.$table1Name") spark.range(100).write.saveAsTable(s"$dbName.$table2Name") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 8755dccb801c2..a728e5cc17001 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -1107,6 +1107,7 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan test("SPARK-32330: Preserve shuffled hash join build side partitioning") { withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "50", SQLConf.SHUFFLE_PARTITIONS.key -> "2", SQLConf.PREFER_SORTMERGEJOIN.key -> "false") { @@ -1130,6 +1131,7 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan // Test broadcast hash join withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "50") { Seq("inner", "left_outer").foreach(joinType => { val plan = df1.join(df2, $"k1" === $"k2", joinType) @@ -1146,6 +1148,7 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan // Test shuffled hash join withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "50", SQLConf.SHUFFLE_PARTITIONS.key -> "2", SQLConf.PREFER_SORTMERGEJOIN.key -> "false") { @@ -1253,6 +1256,7 @@ class 
JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan withSQLConf( // Set broadcast join threshold and number of shuffle partitions, // as shuffled hash join depends on these two configs. + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80", SQLConf.SHUFFLE_PARTITIONS.key -> "2") { val smjDF = df1.join(df2, joinExprs, "full") @@ -1284,7 +1288,9 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan ) inputDFs.foreach { case (df1, df2, joinType) => // Test broadcast hash join - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "200") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "200", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { val bhjCodegenDF = df1.join(df2, $"k1" === $"k2", joinType) assert(bhjCodegenDF.queryExecution.executedPlan.collect { case WholeStageCodegenExec(_ : BroadcastHashJoinExec) => true @@ -1305,6 +1311,7 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan // Set broadcast join threshold and number of shuffle partitions, // as shuffled hash join depends on these two configs. SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "50", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", SQLConf.SHUFFLE_PARTITIONS.key -> "2") { val shjCodegenDF = df1.join(df2, $"k1" === $"k2", joinType) assert(shjCodegenDF.queryExecution.executedPlan.collect { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 5e30f846307ae..4e01d1c06f64e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -877,7 +877,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("aliases in the project should not introduce extra shuffle") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("df1", "df2") { spark.range(10).selectExpr("id AS key", "0").repartition($"key").createTempView("df1") spark.range(20).selectExpr("id AS key", "0").repartition($"key").createTempView("df2") @@ -897,7 +899,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { test("SPARK-33399: aliases should be handled properly in PartitioningCollection output" + " partitioning") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2", "t3") { spark.range(10).repartition($"id").createTempView("t1") spark.range(20).repartition($"id").createTempView("t2") @@ -927,7 +931,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("SPARK-33399: aliases should be handled properly in HashPartitioning") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2", "t3") { spark.range(10).repartition($"id").createTempView("t1") spark.range(20).repartition($"id").createTempView("t2") @@ -955,7 +961,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("SPARK-33399: alias handling should happen 
properly for RangePartitioning") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { val df = spark.range(1, 100) .select(col("id").as("id1")).groupBy("id1").count() // Plan for this will be Range -> ProjectWithAlias -> HashAggregate -> HashAggregate @@ -976,7 +984,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { test("SPARK-33399: aliased should be handled properly " + "for partitioning and sortorder involving complex expressions") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2", "t3") { spark.range(10).select(col("id").as("id1")).createTempView("t1") spark.range(20).select(col("id").as("id2")).createTempView("t2") @@ -1014,7 +1024,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("SPARK-33399: alias handling should happen properly for SinglePartition") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { val df = spark.range(1, 100, 1, 1) .select(col("id").as("id1")).groupBy("id1").count() val planned = df.queryExecution.executedPlan @@ -1031,7 +1043,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { test("SPARK-33399: No extra exchanges in case of" + " [Inner Join -> Project with aliases -> HashAggregate]") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2") { spark.range(10).repartition($"id").createTempView("t1") spark.range(20).repartition($"id").createTempView("t2") @@ -1060,7 +1074,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("SPARK-33400: Normalization of sortOrder should take care of sameOrderExprs") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2", "t3") { spark.range(10).repartition($"id").createTempView("t1") spark.range(20).repartition($"id").createTempView("t2") @@ -1091,7 +1107,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("sort order doesn't have repeated expressions") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("t1", "t2") { spark.range(10).repartition($"id").createTempView("t1") spark.range(20).repartition($"id").createTempView("t2") @@ -1117,7 +1135,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("aliases to expressions should not be replaced") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { withTempView("df1", "df2") { spark.range(10).selectExpr("id AS key", "0").repartition($"key").createTempView("df1") spark.range(20).selectExpr("id AS key", "0").repartition($"key").createTempView("df2") @@ 
-1143,7 +1163,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("aliases in the aggregate expressions should not introduce extra shuffle") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { val t1 = spark.range(10).selectExpr("floor(id/4) as k1") val t2 = spark.range(20).selectExpr("floor(id/4) as k2") @@ -1160,7 +1182,9 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("aliases in the object hash/sort aggregate expressions should not introduce extra shuffle") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { Seq(true, false).foreach { useObjectHashAgg => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> useObjectHashAgg.toString) { val t1 = spark.range(10).selectExpr("floor(id/4) as k1") @@ -1185,21 +1209,22 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } test("aliases in the sort aggregate expressions should not introduce extra sort") { - withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { - withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { - val t1 = spark.range(10).selectExpr("floor(id/4) as k1") - val t2 = spark.range(20).selectExpr("floor(id/4) as k2") + withSQLConf( + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { + val t1 = spark.range(10).selectExpr("floor(id/4) as k1") + val t2 = spark.range(20).selectExpr("floor(id/4) as k2") - val agg1 = t1.groupBy("k1").agg(collect_list("k1")).withColumnRenamed("k1", "k3") - val agg2 = t2.groupBy("k2").agg(collect_list("k2")) + val agg1 = t1.groupBy("k1").agg(collect_list("k1")).withColumnRenamed("k1", "k3") + val agg2 = t2.groupBy("k2").agg(collect_list("k2")) - val planned = agg1.join(agg2, $"k3" === $"k2").queryExecution.executedPlan - assert(planned.collect { case s: SortAggregateExec => s }.nonEmpty) + val planned = agg1.join(agg2, $"k3" === $"k2").queryExecution.executedPlan + assert(planned.collect { case s: SortAggregateExec => s }.nonEmpty) - // We expect two SortExec nodes on each side of join. - val sorts = planned.collect { case s: SortExec => s } - assert(sorts.size == 4) - } + // We expect two SortExec nodes on each side of join. 
+ val sorts = planned.collect { case s: SortExec => s } + assert(sorts.size == 4) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 167e87dd3d5cb..0ff9303421ade 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.{FileSourceScanExec, SortExec, SparkPlan} -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, DisableAdaptiveExecutionSuite} import org.apache.spark.sql.execution.datasources.BucketingUtils import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec @@ -39,7 +39,8 @@ import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} import org.apache.spark.util.Utils import org.apache.spark.util.collection.BitSet -class BucketedReadWithoutHiveSupportSuite extends BucketedReadSuite with SharedSparkSession { +class BucketedReadWithoutHiveSupportSuite + extends BucketedReadSuite with DisableAdaptiveExecutionSuite with SharedSparkSession { protected override def beforeAll(): Unit = { super.beforeAll() assert(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") diff --git a/sql/core/src/test/scala/org/apache/spark/status/api/v1/sql/SqlResourceWithActualMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/status/api/v1/sql/SqlResourceWithActualMetricsSuite.scala index 0c0e3ac90510e..1510e8957f9ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/status/api/v1/sql/SqlResourceWithActualMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/status/api/v1/sql/SqlResourceWithActualMetricsSuite.scala @@ -26,7 +26,9 @@ import org.json4s.jackson.JsonMethods import org.apache.spark.SparkConf import org.apache.spark.deploy.history.HistoryServerSuite.getContentAndCode import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.execution.metric.SQLMetricsTestUtils +import org.apache.spark.sql.internal.SQLConf.ADAPTIVE_EXECUTION_ENABLED import org.apache.spark.sql.test.SharedSparkSession case class Person(id: Int, name: String, age: Int) @@ -35,7 +37,8 @@ case class Salary(personId: Int, salary: Double) /** * Sql Resource Public API Unit Tests running query and extracting the metrics. */ -class SqlResourceWithActualMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { +class SqlResourceWithActualMetricsSuite + extends SharedSparkSession with SQLMetricsTestUtils with SQLHelper { import testImplicits._ @@ -52,8 +55,10 @@ class SqlResourceWithActualMetricsSuite extends SharedSparkSession with SQLMetri test("Check Sql Rest Api Endpoints") { // Materalize result DataFrame - val count = getDF().count() - assert(count == 2, s"Expected Query Count is 2 but received: $count") + withSQLConf(ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + val count = getDF().count() + assert(count == 2, s"Expected Query Count is 2 but received: $count") + } // Spark apps launched by local-mode seems not having `attemptId` as default // so UT is just added for existing endpoints. 
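The pattern applied across the suites above, shown as a standalone sketch: assuming a suite that mixes in `SharedSparkSession` (which provides `spark` and `withSQLConf`), every config the assertion depends on is pinned explicitly rather than inherited from defaults, so the test keeps passing when a default such as `spark.sql.adaptive.enabled` changes. The test name and query below are illustrative only:

```
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
import org.apache.spark.sql.internal.SQLConf

test("hypothetical: plan shape does not depend on the AQE default") {
  withSQLConf(
      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false",
      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") {
    val planned = spark.range(10).join(spark.range(20), "id").queryExecution.executedPlan
    // With AQE pinned off, the executed plan can be inspected directly for exchanges.
    assert(planned.collect { case s: ShuffleExchangeExec => s }.nonEmpty)
  }
}
```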
From ebd8b9357af296b8859e65577ab1e16593fab50d Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 8 Dec 2020 11:04:29 +0800 Subject: [PATCH 137/150] [SPARK-33609][ML] word2vec reduce broadcast size ### What changes were proposed in this pull request? 1, directly use float vectors instead of converting to double vectors, this is about 2x faster than using vec.axpy; 2, mark `wordList` and `wordVecNorms` lazy 3, avoid slicing in computation of `wordVecNorms` ### Why are the changes needed? halve broadcast size ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? existing testsuites Closes #30548 from zhengruifeng/w2v_float32_transform. Lead-authored-by: Ruifeng Zheng Co-authored-by: zhengruifeng Signed-off-by: Ruifeng Zheng --- .../apache/spark/ml/feature/Word2Vec.scala | 32 +++++++++++-------- .../apache/spark/mllib/feature/Word2Vec.scala | 27 +++++++--------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 9b5f5a619e02c..0b9c1b570d943 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -285,27 +285,33 @@ class Word2VecModel private[ml] ( @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) - val vectors = wordVectors.getVectors - .mapValues(vv => Vectors.dense(vv.map(_.toDouble))) - .map(identity).toMap // mapValues doesn't return a serializable map (SI-7005) - val bVectors = dataset.sparkSession.sparkContext.broadcast(vectors) - val d = $(vectorSize) - val emptyVec = Vectors.sparse(d, Array.emptyIntArray, Array.emptyDoubleArray) - val word2Vec = udf { sentence: Seq[String] => + + val bcModel = dataset.sparkSession.sparkContext.broadcast(this.wordVectors) + val size = $(vectorSize) + val emptyVec = Vectors.sparse(size, Array.emptyIntArray, Array.emptyDoubleArray) + val transformer = udf { sentence: Seq[String] => if (sentence.isEmpty) { emptyVec } else { - val sum = Vectors.zeros(d) + val wordIndices = bcModel.value.wordIndex + val wordVectors = bcModel.value.wordVectors + val array = Array.ofDim[Double](size) + var count = 0 sentence.foreach { word => - bVectors.value.get(word).foreach { v => - BLAS.axpy(1.0, v, sum) + wordIndices.get(word).foreach { index => + val offset = index * size + var i = 0 + while (i < size) { array(i) += wordVectors(offset + i); i += 1 } } + count += 1 } - BLAS.scal(1.0 / sentence.size, sum) - sum + val vec = Vectors.dense(array) + BLAS.scal(1.0 / count, vec) + vec } } - dataset.withColumn($(outputCol), word2Vec(col($(inputCol))), + + dataset.withColumn($(outputCol), transformer(col($(inputCol))), outputSchema($(outputCol)).metadata) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index eeb583f84ca8b..8a6317a910146 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -502,22 +502,15 @@ class Word2VecModel private[spark] ( private val vectorSize = wordVectors.length / numWords // wordList: Ordered list of words obtained from wordIndex. 
- private val wordList: Array[String] = { - val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip - wl.toArray + private lazy val wordList: Array[String] = { + wordIndex.toSeq.sortBy(_._2).iterator.map(_._1).toArray } // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. - private val wordVecNorms: Array[Float] = { - val wordVecNorms = new Array[Float](numWords) - var i = 0 - while (i < numWords) { - val vec = wordVectors.slice(i * vectorSize, i * vectorSize + vectorSize) - wordVecNorms(i) = blas.snrm2(vectorSize, vec, 1) - i += 1 - } - wordVecNorms + private lazy val wordVecNorms: Array[Float] = { + val size = vectorSize + Array.tabulate(numWords)(i => blas.snrm2(size, wordVectors, i * size, 1)) } @Since("1.5.0") @@ -538,9 +531,13 @@ class Word2VecModel private[spark] ( @Since("1.1.0") def transform(word: String): Vector = { wordIndex.get(word) match { - case Some(ind) => - val vec = wordVectors.slice(ind * vectorSize, ind * vectorSize + vectorSize) - Vectors.dense(vec.map(_.toDouble)) + case Some(index) => + val size = vectorSize + val offset = index * size + val array = Array.ofDim[Double](size) + var i = 0 + while (i < size) { array(i) = wordVectors(offset + i); i += 1 } + Vectors.dense(array) case None => throw new IllegalStateException(s"$word not in vocabulary") } From 8bcebfa59a64123f014c01bc4fb5de8d9624f8f4 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Mon, 7 Dec 2020 19:09:59 -0800 Subject: [PATCH 138/150] [SPARK-33698][BUILD][TESTS] Fix the build error of OracleIntegrationSuite for Scala 2.13 ### What changes were proposed in this pull request? This PR fixes a build error of `OracleIntegrationSuite` with Scala 2.13. ### Why are the changes needed? Build should pass with Scala 2.13. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? I confirmed that the build pass with the following command. ``` $ build/sbt -Pdocker-integration-tests -Pscala-2.13 "docker-integration-tests/test:compile" ``` Closes #30660 from sarutak/fix-docker-integration-tests-for-scala-2.13. Authored-by: Kousuke Saruta Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index 60eb1c055a38e..3937d62afacc2 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -401,7 +401,7 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSpark val values = rows(0) assert(values.getDecimal(0).equals(new java.math.BigDecimal("12312321321321312312312312123"))) assert(values.getInt(1).equals(1)) - assert(values.getBoolean(2).equals(false)) + assert(values.getBoolean(2) == false) } test("SPARK-22303: handle BINARY_DOUBLE and BINARY_FLOAT as DoubleType and FloatType") { From 5aefc49b0f7047f2c928c18b371098314c2f59f0 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Tue, 8 Dec 2020 03:54:16 +0000 Subject: [PATCH 139/150] [SPARK-33664][SQL] Migrate ALTER TABLE ... RENAME TO to use UnresolvedTableOrView to resolve identifier ### What changes were proposed in this pull request? This PR proposes to migrate `ALTER [TABLE|ViEW] ... 
RENAME TO` to use `UnresolvedTableOrView` to resolve the table/view identifier. This allows consistent resolution rules (temp view first, etc.) to be applied for both v1/v2 commands. More info about the consistent resolution rule proposal can be found in [JIRA](https://issues.apache.org/jira/browse/SPARK-29900) or [proposal doc](https://docs.google.com/document/d/1hvLjGA8y_W_hhilpngXVub1Ebv8RsMap986nENCFnrg/edit?usp=sharing). ### Why are the changes needed? To use `UnresolvedTableOrView` for table/view resolution. Note that `AlterTableRenameCommand` internally resolves to a temp view first, so there is no resolution behavior change with this PR. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Updated existing tests. Closes #30610 from imback82/rename_v2. Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/ResolveCatalogs.scala | 6 ------ .../spark/sql/catalyst/parser/AstBuilder.scala | 12 ++++++++---- .../sql/catalyst/plans/logical/statements.scala | 8 -------- .../sql/catalyst/plans/logical/v2Commands.scala | 10 ++++++---- .../sql/catalyst/parser/DDLParserSuite.scala | 10 ++++++++-- .../analysis/ResolveSessionCatalog.scala | 3 +-- .../datasources/v2/DataSourceV2Strategy.scala | 8 ++++++-- .../sql/connector/DataSourceV2SQLSuite.scala | 13 ++++++++++--- .../v2/jdbc/JDBCTableCatalogSuite.scala | 16 +++++++--------- 9 files changed, 46 insertions(+), 40 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index a90de697bc084..6d89414ba106d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -133,12 +133,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) s"Can not specify catalog `${catalog.name}` for view ${tbl.quoted} " + s"because view support in catalog has not been implemented yet") - case RenameTableStatement(NonSessionCatalogAndTable(catalog, oldName), newNameParts, isView) => - if (isView) { - throw new AnalysisException("Renaming view is not supported in v2 catalogs.") - } - RenameTable(catalog.asTableCatalog, oldName.asIdentifier, newNameParts.asIdentifier) - case c @ CreateTableStatement( NonSessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _, _, _) => assertNoNullTypeInSchema(c.tableSchema) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index a22383c62bf74..42c67ac963cbe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3834,7 +3834,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Create a [[RenameTableStatement]] command. + * Create a [[RenameTable]] command. 
* * For example: * {{{ @@ -3843,10 +3843,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * }}} */ override def visitRenameTable(ctx: RenameTableContext): LogicalPlan = withOrigin(ctx) { - RenameTableStatement( - visitMultipartIdentifier(ctx.from), + val isView = ctx.VIEW != null + val relationStr = if (isView) "VIEW" else "TABLE" + RenameTable( + UnresolvedTableOrView( + visitMultipartIdentifier(ctx.from), + s"ALTER $relationStr ... RENAME TO"), visitMultipartIdentifier(ctx.to), - ctx.VIEW != null) + isView) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 1763547792e35..8f0889bbcebd8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -338,14 +338,6 @@ case class AlterViewAsStatement( originalText: String, query: LogicalPlan) extends ParsedStatement -/** - * ALTER TABLE ... RENAME TO command, as parsed from SQL. - */ -case class RenameTableStatement( - oldName: Seq[String], - newName: Seq[String], - isView: Boolean) extends ParsedStatement - /** * A DROP VIEW statement, as parsed from SQL. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 67056470418fe..6f35364cce131 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -456,12 +456,14 @@ case class AlterTable( } /** - * The logical plan of the ALTER TABLE RENAME command. + * The logical plan of the ALTER [TABLE|VIEW] ... RENAME TO command. */ case class RenameTable( - catalog: TableCatalog, - oldIdent: Identifier, - newIdent: Identifier) extends Command + child: LogicalPlan, + newName: Seq[String], + isView: Boolean) extends Command { + override def children: Seq[LogicalPlan] = child :: Nil +} /** * The logical plan of the SHOW TABLE command. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index e98ec6a667a73..f925be8617b47 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -1103,10 +1103,16 @@ class DDLParserSuite extends AnalysisTest { test("alter table/view: rename table/view") { comparePlans( parsePlan("ALTER TABLE a.b.c RENAME TO x.y.z"), - RenameTableStatement(Seq("a", "b", "c"), Seq("x", "y", "z"), isView = false)) + RenameTable( + UnresolvedTableOrView(Seq("a", "b", "c"), "ALTER TABLE ... RENAME TO"), + Seq("x", "y", "z"), + isView = false)) comparePlans( parsePlan("ALTER VIEW a.b.c RENAME TO x.y.z"), - RenameTableStatement(Seq("a", "b", "c"), Seq("x", "y", "z"), isView = true)) + RenameTable( + UnresolvedTableOrView(Seq("a", "b", "c"), "ALTER VIEW ... 
RENAME TO"), + Seq("x", "y", "z"), + isView = true)) } test("describe table column") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index a87ed4b6275d8..7e5f39e398a6b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -237,8 +237,7 @@ class ResolveSessionCatalog( } AlterDatabaseSetLocationCommand(ns.head, location) - // v1 RENAME TABLE supports temp view. - case RenameTableStatement(TempViewOrV1Table(oldName), newName, isView) => + case RenameTable(ResolvedV1TableOrViewIdentifier(oldName), newName, isView) => AlterTableRenameCommand(oldName.asTableIdentifier, newName.asTableIdentifier, isView) // Use v1 command to describe (temp) view, as v2 catalog doesn't support view yet. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 5289d359f7809..075d2a43dce4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -257,8 +257,12 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case AlterTable(catalog, ident, _, changes) => AlterTableExec(catalog, ident, changes) :: Nil - case RenameTable(catalog, oldIdent, newIdent) => - RenameTableExec(catalog, oldIdent, newIdent) :: Nil + case RenameTable(ResolvedTable(catalog, oldIdent, _), newIdent, isView) => + if (isView) { + throw new AnalysisException( + "Cannot rename a table with ALTER VIEW. Please use ALTER TABLE instead.") + } + RenameTableExec(catalog, oldIdent, newIdent.asIdentifier) :: Nil case AlterNamespaceSetProperties(ResolvedNamespace(catalog, ns), properties) => AlterNamespaceSetPropertiesExec(catalog.asNamespaceCatalog, ns, properties) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 6838a7644a29f..2673577aecf36 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -1975,10 +1975,16 @@ class DataSourceV2SQLSuite test("AlterTable: rename table basic test") { withTable("testcat.ns1.new") { - sql(s"CREATE TABLE testcat.ns1.ns2.old USING foo AS SELECT id, data FROM source") + sql("CREATE TABLE testcat.ns1.ns2.old USING foo AS SELECT id, data FROM source") checkAnswer(sql("SHOW TABLES FROM testcat.ns1.ns2"), Seq(Row("ns1.ns2", "old"))) - sql(s"ALTER TABLE testcat.ns1.ns2.old RENAME TO ns1.new") + val e = intercept[AnalysisException] { + sql("ALTER VIEW testcat.ns1.ns2.old RENAME TO ns1.new") + } + assert(e.getMessage.contains( + "Cannot rename a table with ALTER VIEW. 
Please use ALTER TABLE instead")) + + sql("ALTER TABLE testcat.ns1.ns2.old RENAME TO ns1.new") checkAnswer(sql("SHOW TABLES FROM testcat.ns1.ns2"), Seq.empty) checkAnswer(sql("SHOW TABLES FROM testcat.ns1"), Seq(Row("ns1", "new"))) } @@ -1988,7 +1994,8 @@ class DataSourceV2SQLSuite val e = intercept[AnalysisException] { sql(s"ALTER VIEW testcat.ns.tbl RENAME TO ns.view") } - assert(e.getMessage.contains("Renaming view is not supported in v2 catalogs")) + assert(e.getMessage.contains( + "Table or view not found for 'ALTER VIEW ... RENAME TO': testcat.ns.tbl")) } test("ANALYZE TABLE") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala index 9e9df7db1e1c6..e764f71867426 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTableCatalogSuite.scala @@ -23,7 +23,7 @@ import org.apache.log4j.Level import org.apache.spark.SparkConf import org.apache.spark.sql.{AnalysisException, QueryTest, Row} -import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -106,18 +106,16 @@ class JDBCTableCatalogSuite extends QueryTest with SharedSparkSession { Seq(Row("test", "dst_table"), Row("test", "people"))) } // Rename not existing table or namespace - val exp1 = intercept[NoSuchTableException] { - sql(s"ALTER TABLE h2.test.not_existing_table RENAME TO test.dst_table") + val exp1 = intercept[AnalysisException] { + sql("ALTER TABLE h2.test.not_existing_table RENAME TO test.dst_table") } assert(exp1.getMessage.contains( - "Failed table renaming from test.not_existing_table to test.dst_table")) - assert(exp1.cause.get.getMessage.contains("Table \"not_existing_table\" not found")) - val exp2 = intercept[NoSuchNamespaceException] { - sql(s"ALTER TABLE h2.bad_test.not_existing_table RENAME TO test.dst_table") + "Table or view not found for 'ALTER TABLE ... RENAME TO': h2.test.not_existing_table")) + val exp2 = intercept[AnalysisException] { + sql("ALTER TABLE h2.bad_test.not_existing_table RENAME TO test.dst_table") } assert(exp2.getMessage.contains( - "Failed table renaming from bad_test.not_existing_table to test.dst_table")) - assert(exp2.cause.get.getMessage.contains("Schema \"bad_test\" not found")) + "Table or view not found for 'ALTER TABLE ... RENAME TO': h2.bad_test.not_existing_table")) // Rename to an existing table withTable("h2.test.dst_table") { withConnection { conn => From 3a6546d3858e7c184f36cb6c4fd454f2142460f0 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 8 Dec 2020 14:11:39 +0900 Subject: [PATCH 140/150] [MINOR][INFRA] Add -Pdocker-integration-tests to GitHub Action Scala 2.13 build job ### What changes were proposed in this pull request? This aims to add `-Pdocker-integration-tests` at GitHub Action job for Scala 2.13 compilation. ### Why are the changes needed? We fixed Scala 2.13 compilation of this module at https://github.com/apache/spark/pull/30660 . This PR will prevent accidental regression at that module. 
### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass GitHub Action Scala 2.13 job. Closes #30661 from dongjoon-hyun/SPARK-DOCKER-IT. Authored-by: Dongjoon Hyun Signed-off-by: Kousuke Saruta --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 72b2caf907151..e40d6362fd23f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -414,7 +414,7 @@ jobs: - name: Build with SBT run: | ./dev/change-scala-version.sh 2.13 - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pscala-2.13 compile test:compile + ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pscala-2.13 compile test:compile hadoop-2: name: Hadoop 2 build with SBT From 031c5ef280e0cba8c4718a6457a44b6cccb17f46 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 7 Dec 2020 23:10:35 -0800 Subject: [PATCH 141/150] [SPARK-33679][SQL] Enable spark.sql.adaptive.enabled by default ### What changes were proposed in this pull request? This PR aims to enable `spark.sql.adaptive.enabled` by default for Apache Spark **3.2.0**. ### Why are the changes needed? By switching the default for Apache Spark 3.2, the whole community can focus more on the stabilizing this feature in the various situation more seriously. ### Does this PR introduce _any_ user-facing change? Yes, but this is an improvement and it's supposed to have no bugs. ### How was this patch tested? Pass the CIs. Closes #30628 from dongjoon-hyun/SPARK-33679. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- docs/sql-migration-guide.md | 4 ++++ .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 2c86e7a932637..65a769da70aea 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -22,6 +22,10 @@ license: | * Table of contents {:toc} +## Upgrading from Spark SQL 3.1 to 3.2 + + - In Spark 3.2, `spark.sql.adaptive.enabled` is enabled by default. To restore the behavior before Spark 3.2, you can set `spark.sql.adaptive.enabled` to `false`. + ## Upgrading from Spark SQL 3.0 to 3.1 - In Spark 3.1, statistical aggregation function includes `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, `corr` will return `NULL` instead of `Double.NaN` when `DivideByZero` occurs during expression evaluation, for example, when `stddev_samp` applied on a single element set. In Spark version 3.0 and earlier, it will return `Double.NaN` in such case. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`. 
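As the new migration-guide entry above states, users can opt back out of adaptive execution once the default flips; a small illustrative sketch (the application name is made up):

```
import org.apache.spark.sql.SparkSession

// Opt out when building the session...
val spark = SparkSession.builder()
  .appName("aqe-opt-out")
  .config("spark.sql.adaptive.enabled", "false")
  .getOrCreate()

// ...or per session at runtime, since the conf is not static.
spark.conf.set("spark.sql.adaptive.enabled", "false")
```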
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index bc62213bdb740..11fe6c7894f76 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -404,7 +404,7 @@ object SQLConf {
         "middle of query execution, based on accurate runtime statistics.")
       .version("1.6.0")
       .booleanConf
-      .createWithDefault(false)
+      .createWithDefault(true)
 
   val ADAPTIVE_EXECUTION_FORCE_APPLY = buildConf("spark.sql.adaptive.forceApply")
     .internal()

From 99613cd5815b2de12274027dee0c0a6c0c57bd95 Mon Sep 17 00:00:00 2001
From: luluorta
Date: Tue, 8 Dec 2020 20:45:25 +0900
Subject: [PATCH 142/150] [SPARK-33677][SQL] Skip LikeSimplification rule if pattern contains any escapeChar

### What changes were proposed in this pull request?

The `LikeSimplification` rule does not work correctly for many cases whose patterns contain escape characters, for example:

`SELECT s LIKE 'm%aca' ESCAPE '%' FROM t`
`SELECT s LIKE 'maacaa' ESCAPE 'a' FROM t`

For simplicity, this PR simply skips this rule whenever `pattern` contains any `escapeChar`.

### Why are the changes needed?

Without this fix, the simplified expressions can return corrupted (incorrect) results.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Added unit tests.

Closes #30625 from luluorta/SPARK-33677.

Authored-by: luluorta
Signed-off-by: Takeshi Yamamuro
---
 .../sql/catalyst/optimizer/expressions.scala | 18 ++++---
 .../optimizer/LikeSimplificationSuite.scala | 48 +++++++++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala | 14 ++++++
 3 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 4cdaf10dd3c60..7666c4a53e5dd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -543,27 +543,33 @@ object LikeSimplification extends Rule[LogicalPlan] {
   private val equalTo = "([^_%]*)".r
 
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-    case Like(input, Literal(pattern, StringType), escapeChar) =>
+    case l @ Like(input, Literal(pattern, StringType), escapeChar) =>
       if (pattern == null) {
         // If pattern is null, return null value directly, since "col like null" == null.
         Literal(null, BooleanType)
       } else {
-        val escapeStr = String.valueOf(escapeChar)
         pattern.toString match {
-          case startsWith(prefix) if !prefix.endsWith(escapeStr) =>
+          // There are three different situations when pattern containing escapeChar:
+          // 1. pattern contains invalid escape sequence, e.g. 'm\aca'
+          // 2. pattern contains escaped wildcard character, e.g. 'ma\%ca'
+          // 3. pattern contains escaped escape character, e.g. 'ma\\ca'
+          // Although there are patterns can be optimized if we handle the escape first, we just
+          // skip this rule if pattern contains any escapeChar for simplicity.
+          case p if p.contains(escapeChar) => l
+          case startsWith(prefix) =>
             StartsWith(input, Literal(prefix))
           case endsWith(postfix) =>
             EndsWith(input, Literal(postfix))
           // 'a%a' pattern is basically same with 'a%' && '%a'.
           // However, the additional `Length` condition is required to prevent 'a' match 'a%a'.
- case startsAndEndsWith(prefix, postfix) if !prefix.endsWith(escapeStr) => + case startsAndEndsWith(prefix, postfix) => And(GreaterThanOrEqual(Length(input), Literal(prefix.length + postfix.length)), And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix)))) - case contains(infix) if !infix.endsWith(escapeStr) => + case contains(infix) => Contains(input, Literal(infix)) case equalTo(str) => EqualTo(input, Literal(str)) - case _ => Like(input, Literal.create(pattern, StringType), escapeChar) + case _ => l } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala index 436f62e4225c8..1812dce0da426 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala @@ -116,4 +116,52 @@ class LikeSimplificationSuite extends PlanTest { val optimized2 = Optimize.execute(originalQuery2.analyze) comparePlans(optimized2, originalQuery2.analyze) } + + test("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar") { + val originalQuery1 = + testRelation + .where(('a like "abc%") || ('a like "\\abc%")) + val optimized1 = Optimize.execute(originalQuery1.analyze) + val correctAnswer1 = testRelation + .where(StartsWith('a, "abc") || ('a like "\\abc%")) + .analyze + comparePlans(optimized1, correctAnswer1) + + val originalQuery2 = + testRelation + .where(('a like "%xyz") || ('a like "%xyz\\")) + val optimized2 = Optimize.execute(originalQuery2.analyze) + val correctAnswer2 = testRelation + .where(EndsWith('a, "xyz") || ('a like "%xyz\\")) + .analyze + comparePlans(optimized2, correctAnswer2) + + val originalQuery3 = + testRelation + .where(('a like ("@bc%def", '@')) || ('a like "abc%def")) + val optimized3 = Optimize.execute(originalQuery3.analyze) + val correctAnswer3 = testRelation + .where(('a like ("@bc%def", '@')) || + (Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def")))) + .analyze + comparePlans(optimized3, correctAnswer3) + + val originalQuery4 = + testRelation + .where(('a like "%mn%") || ('a like ("%mn%", '%'))) + val optimized4 = Optimize.execute(originalQuery4.analyze) + val correctAnswer4 = testRelation + .where(Contains('a, "mn") || ('a like ("%mn%", '%'))) + .analyze + comparePlans(optimized4, correctAnswer4) + + val originalQuery5 = + testRelation + .where(('a like "abc") || ('a like ("abbc", 'b'))) + val optimized5 = Optimize.execute(originalQuery5.analyze) + val correctAnswer5 = testRelation + .where(('a === "abc") || ('a like ("abbc", 'b'))) + .analyze + comparePlans(optimized5, correctAnswer5) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 727482e551a8b..2eeb729ece3fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -3718,6 +3718,20 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } } } + + test("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar") { + withTempView("df") { + Seq("m@ca").toDF("s").createOrReplaceTempView("df") + + val e = intercept[AnalysisException] { + sql("SELECT s LIKE 'm%@ca' ESCAPE '%' FROM df").collect() + } + 
assert(e.message.contains("the pattern 'm%@ca' is invalid, " + + "the escape character is not allowed to precede '@'")) + + checkAnswer(sql("SELECT s LIKE 'm@@ca' ESCAPE '@' FROM df"), Row(true)) + } + } } case class Foo(bar: Option[String]) From 2b30dde24972f7123b7ee14583fdce72e9ee955f Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Tue, 8 Dec 2020 12:08:22 +0000 Subject: [PATCH 143/150] [SPARK-33688][SQL] Migrate SHOW TABLE EXTENDED to new resolution framework ### What changes were proposed in this pull request? 1. Remove old statement `ShowTableStatement` 2. Introduce new command `ShowTableExtended` for `SHOW TABLE EXTENDED`. This PR is the first step of new V2 implementation of `SHOW TABLE EXTENDED`, see SPARK-33393. ### Why are the changes needed? This is a part of effort to make the relation lookup behavior consistent: SPARK-29900. ### Does this PR introduce _any_ user-facing change? The changes should not affect V1 tables. For V2, Spark outputs the error: ``` SHOW TABLE EXTENDED is not supported for v2 tables. ``` ### How was this patch tested? By running `SHOW TABLE EXTENDED` tests: ``` $ build/sbt -Phive-2.3 -Phive-thriftserver "test:testOnly *ShowTablesSuite" ``` Closes #30645 from MaxGekk/show-table-extended-statement. Authored-by: Max Gekk Signed-off-by: Wenchen Fan --- .../spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../sql/catalyst/analysis/Analyzer.scala | 2 ++ .../sql/catalyst/parser/AstBuilder.scala | 15 +++++++---- .../catalyst/plans/logical/statements.scala | 9 ------- .../catalyst/plans/logical/v2Commands.scala | 20 ++++++++++++-- .../analysis/ResolveSessionCatalog.scala | 20 +++++++++----- .../datasources/v2/DataSourceV2Strategy.scala | 3 +++ .../command/ShowTablesParserSuite.scala | 27 ++++++++++++------- .../command/v2/ShowTablesSuite.scala | 7 +++-- 9 files changed, 67 insertions(+), 38 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index a23994f456f75..b08451d8a6cfa 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -198,7 +198,7 @@ statement | SHOW TABLES ((FROM | IN) multipartIdentifier)? (LIKE? pattern=STRING)? #showTables | SHOW TABLE EXTENDED ((FROM | IN) ns=multipartIdentifier)? - LIKE pattern=STRING partitionSpec? #showTable + LIKE pattern=STRING partitionSpec? #showTableExtended | SHOW TBLPROPERTIES table=multipartIdentifier ('(' key=tablePropertyKey ')')? 
#showTblProperties | SHOW COLUMNS (FROM | IN) table=multipartIdentifier diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6541961f5613e..680ec982b2112 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -847,6 +847,8 @@ class Analyzer(override val catalogManager: CatalogManager) def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case s @ ShowTables(UnresolvedNamespace(Seq()), _) => s.copy(namespace = ResolvedNamespace(currentCatalog, catalogManager.currentNamespace)) + case s @ ShowTableExtended(UnresolvedNamespace(Seq()), _, _) => + s.copy(namespace = ResolvedNamespace(currentCatalog, catalogManager.currentNamespace)) case s @ ShowViews(UnresolvedNamespace(Seq()), _) => s.copy(namespace = ResolvedNamespace(currentCatalog, catalogManager.currentNamespace)) case UnresolvedNamespace(Seq()) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 42c67ac963cbe..b6bd3b77fc874 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3190,13 +3190,18 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Create a [[ShowTableStatement]] command. + * Create a [[ShowTableExtended]] command. */ - override def visitShowTable(ctx: ShowTableContext): LogicalPlan = withOrigin(ctx) { - ShowTableStatement( - Option(ctx.ns).map(visitMultipartIdentifier), + override def visitShowTableExtended( + ctx: ShowTableExtendedContext): LogicalPlan = withOrigin(ctx) { + val multiPart = Option(ctx.multipartIdentifier).map(visitMultipartIdentifier) + val partitionKeys = Option(ctx.partitionSpec).map { specCtx => + UnresolvedPartitionSpec(visitNonOptionalPartitionSpec(specCtx), None) + } + ShowTableExtended( + UnresolvedNamespace(multiPart.getOrElse(Seq.empty[String])), string(ctx.pattern), - Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec)) + partitionKeys) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 8f0889bbcebd8..402ae657d1709 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -377,15 +377,6 @@ case class InsertIntoStatement( override def children: Seq[LogicalPlan] = query :: Nil } -/** - * A SHOW TABLE EXTENDED statement, as parsed from SQL. - */ -case class ShowTableStatement( - namespace: Option[Seq[String]], - pattern: String, - partitionSpec: Option[TablePartitionSpec]) - extends ParsedStatement - /** * A CREATE NAMESPACE statement, as parsed from SQL. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 6f35364cce131..72ba9cf6db0e2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, ColumnChange} import org.apache.spark.sql.connector.expressions.Transform -import org.apache.spark.sql.types.{DataType, MetadataBuilder, StringType, StructType} +import org.apache.spark.sql.types.{BooleanType, DataType, MetadataBuilder, StringType, StructType} /** * Base trait for DataSourceV2 write commands @@ -466,7 +466,7 @@ case class RenameTable( } /** - * The logical plan of the SHOW TABLE command. + * The logical plan of the SHOW TABLES command. */ case class ShowTables( namespace: LogicalPlan, @@ -478,6 +478,22 @@ case class ShowTables( AttributeReference("tableName", StringType, nullable = false)()) } +/** + * The logical plan of the SHOW TABLE EXTENDED command. + */ +case class ShowTableExtended( + namespace: LogicalPlan, + pattern: String, + partitionSpec: Option[PartitionSpec]) extends Command { + override def children: Seq[LogicalPlan] = namespace :: Nil + + override val output: Seq[Attribute] = Seq( + AttributeReference("namespace", StringType, nullable = false)(), + AttributeReference("tableName", StringType, nullable = false)(), + AttributeReference("isTemporary", BooleanType, nullable = false)(), + AttributeReference("information", StringType, nullable = false)()) +} + /** * The logical plan of the SHOW VIEWS command. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 7e5f39e398a6b..4c7e6fefd9759 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -383,14 +383,20 @@ class ResolveSessionCatalog( } ShowTablesCommand(Some(ns.head), pattern) - case ShowTableStatement(ns, pattern, partitionsSpec) => - val db = ns match { - case Some(ns) if ns.length != 1 => - throw new AnalysisException( - s"The database name is not valid: ${ns.quoted}") - case _ => ns.map(_.head) + case ShowTableExtended( + SessionCatalogAndNamespace(_, ns), + pattern, + partitionSpec @ (None | Some(UnresolvedPartitionSpec(_, _)))) => + assert(ns.nonEmpty) + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") } - ShowTablesCommand(db, Some(pattern), true, partitionsSpec) + ShowTablesCommand( + databaseName = Some(ns.head), + tableIdentifierPattern = Some(pattern), + isExtended = true, + partitionSpec.map(_.asInstanceOf[UnresolvedPartitionSpec].spec)) // ANALYZE TABLE works on permanent views if the views are cached. 
case AnalyzeTable(ResolvedV1TableOrViewIdentifier(ident), partitionSpec, noScan) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 075d2a43dce4e..5f67b39b95c35 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -295,6 +295,9 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case r @ ShowTables(ResolvedNamespace(catalog, ns), pattern) => ShowTablesExec(r.output, catalog.asTableCatalog, ns, pattern) :: Nil + case _: ShowTableExtended => + throw new AnalysisException("SHOW TABLE EXTENDED is not supported for v2 tables.") + case SetCatalogAndNamespace(catalogManager, catalogName, ns) => SetCatalogAndNamespaceExec(catalogManager, catalogName, ns) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesParserSuite.scala index 16f3dea8d75ef..d68e1233f7ab2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowTablesParserSuite.scala @@ -17,9 +17,9 @@ package org.apache.spark.sql.execution.command -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedNamespace} +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedNamespace, UnresolvedPartitionSpec} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan -import org.apache.spark.sql.catalyst.plans.logical.{ShowTables, ShowTableStatement} +import org.apache.spark.sql.catalyst.plans.logical.{ShowTableExtended, ShowTables} import org.apache.spark.sql.test.SharedSparkSession class ShowTablesParserSuite extends AnalysisTest with SharedSparkSession { @@ -52,25 +52,32 @@ class ShowTablesParserSuite extends AnalysisTest with SharedSparkSession { test("show table extended") { comparePlans( parsePlan("SHOW TABLE EXTENDED LIKE '*test*'"), - ShowTableStatement(None, "*test*", None)) + ShowTableExtended(UnresolvedNamespace(Seq.empty[String]), "*test*", None)) comparePlans( parsePlan(s"SHOW TABLE EXTENDED FROM $catalog.ns1.ns2 LIKE '*test*'"), - ShowTableStatement(Some(Seq(catalog, "ns1", "ns2")), "*test*", None)) + ShowTableExtended(UnresolvedNamespace(Seq(catalog, "ns1", "ns2")), "*test*", None)) comparePlans( parsePlan(s"SHOW TABLE EXTENDED IN $catalog.ns1.ns2 LIKE '*test*'"), - ShowTableStatement(Some(Seq(catalog, "ns1", "ns2")), "*test*", None)) + ShowTableExtended(UnresolvedNamespace(Seq(catalog, "ns1", "ns2")), "*test*", None)) comparePlans( parsePlan("SHOW TABLE EXTENDED LIKE '*test*' PARTITION(ds='2008-04-09', hr=11)"), - ShowTableStatement(None, "*test*", Some(Map("ds" -> "2008-04-09", "hr" -> "11")))) + ShowTableExtended( + UnresolvedNamespace(Seq.empty[String]), + "*test*", + Some(UnresolvedPartitionSpec(Map("ds" -> "2008-04-09", "hr" -> "11"))))) comparePlans( parsePlan(s"SHOW TABLE EXTENDED FROM $catalog.ns1.ns2 LIKE '*test*' " + "PARTITION(ds='2008-04-09')"), - ShowTableStatement(Some(Seq(catalog, "ns1", "ns2")), "*test*", - Some(Map("ds" -> "2008-04-09")))) + ShowTableExtended( + UnresolvedNamespace(Seq(catalog, "ns1", "ns2")), + "*test*", + Some(UnresolvedPartitionSpec(Map("ds" -> 
"2008-04-09"))))) comparePlans( parsePlan(s"SHOW TABLE EXTENDED IN $catalog.ns1.ns2 LIKE '*test*' " + "PARTITION(ds='2008-04-09')"), - ShowTableStatement(Some(Seq(catalog, "ns1", "ns2")), "*test*", - Some(Map("ds" -> "2008-04-09")))) + ShowTableExtended( + UnresolvedNamespace(Seq(catalog, "ns1", "ns2")), + "*test*", + Some(UnresolvedPartitionSpec(Map("ds" -> "2008-04-09"))))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala index aff1729a000b6..370c8358e64da 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowTablesSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.command.v2 import org.apache.spark.SparkConf import org.apache.spark.sql.{AnalysisException, Row} -import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException import org.apache.spark.sql.connector.InMemoryTableCatalog import org.apache.spark.sql.execution.command import org.apache.spark.sql.test.SharedSparkSession @@ -74,7 +73,7 @@ class ShowTablesSuite extends command.ShowTablesSuiteBase with SharedSparkSessio val e = intercept[AnalysisException] { sql(sqlCommand) } - assert(e.message.contains(s"The database name is not valid: ${namespace}")) + assert(e.message.contains(s"SHOW TABLE EXTENDED is not supported for v2 tables")) } val namespace = s"$catalog.ns1.ns2" @@ -101,10 +100,10 @@ class ShowTablesSuite extends command.ShowTablesSuiteBase with SharedSparkSessio val table = "people" withTable(s"$catalog.$table") { sql(s"CREATE TABLE $catalog.$table (name STRING, id INT) $defaultUsing") - val errMsg = intercept[NoSuchDatabaseException] { + val errMsg = intercept[AnalysisException] { sql(s"SHOW TABLE EXTENDED FROM $catalog LIKE '*$table*'").collect() }.getMessage - assert(errMsg.contains(s"Database '$catalog' not found")) + assert(errMsg.contains("SHOW TABLE EXTENDED is not supported for v2 tables")) } } } From c05ee06f5b711dd261dc94a01b4ba4ffccdf2ea0 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Tue, 8 Dec 2020 14:07:58 +0000 Subject: [PATCH 144/150] [SPARK-33685][SQL] Migrate DROP VIEW command to use UnresolvedView to resolve the identifier ### What changes were proposed in this pull request? This PR introduces `UnresolvedView` in the resolution framework to resolve the identifier. This PR then migrates `DROP VIEW` to use `UnresolvedView` to resolve the table/view identifier. This allows consistent resolution rules (temp view first, etc.) to be applied for both v1/v2 commands. More info about the consistent resolution rule proposal can be found in [JIRA](https://issues.apache.org/jira/browse/SPARK-29900) or [proposal doc](https://docs.google.com/document/d/1hvLjGA8y_W_hhilpngXVub1Ebv8RsMap986nENCFnrg/edit?usp=sharing). ### Why are the changes needed? To use `UnresolvedView` for view resolution. Note that there is no resolution behavior change with this PR. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Updated existing tests. Closes #30636 from imback82/drop_view_v2. 
Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 17 ++++++++-- .../sql/catalyst/analysis/CheckAnalysis.scala | 13 ++++++-- .../catalyst/analysis/ResolveCatalogs.scala | 5 --- ...cala => ResolveCommandsWithIfExists.scala} | 14 +++++---- .../catalyst/analysis/v2ResolutionPlans.scala | 13 ++++++++ .../sql/catalyst/parser/AstBuilder.scala | 9 ++++-- .../catalyst/plans/logical/statements.scala | 7 ----- .../catalyst/plans/logical/v2Commands.scala | 15 +++++++-- .../sql/catalyst/parser/DDLParserSuite.scala | 17 ++++++---- .../analysis/ResolveSessionCatalog.scala | 5 ++- .../datasources/v2/DataSourceV2Strategy.scala | 2 +- .../sql/connector/DataSourceV2SQLSuite.scala | 14 ++++----- .../sql/execution/command/DDLSuite.scala | 5 ++- .../command/PlanResolutionSuite.scala | 31 ++++++++++++++++--- .../sql/hive/execution/HiveDDLSuite.scala | 3 +- 15 files changed, 118 insertions(+), 52 deletions(-) rename sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/{ResolveNoopDropTable.scala => ResolveCommandsWithIfExists.scala} (63%) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 680ec982b2112..6b0cf4be7de74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -150,7 +150,7 @@ object AnalysisContext { * [[UnresolvedRelation]]s into fully typed objects using information in a [[SessionCatalog]]. */ class Analyzer(override val catalogManager: CatalogManager) - extends RuleExecutor[LogicalPlan] with CheckAnalysis with LookupCatalog with SQLConfHelper { + extends RuleExecutor[LogicalPlan] with CheckAnalysis with SQLConfHelper { private val v1SessionCatalog: SessionCatalog = catalogManager.v1SessionCatalog @@ -277,7 +277,7 @@ class Analyzer(override val catalogManager: CatalogManager) TypeCoercion.typeCoercionRules ++ extendedResolutionRules : _*), Batch("Post-Hoc Resolution", Once, - Seq(ResolveNoopDropTable) ++ + Seq(ResolveCommandsWithIfExists) ++ postHocResolutionRules: _*), Batch("Normalize Alter Table", Once, ResolveAlterTableChanges), Batch("Remove Unresolved Hints", Once, @@ -889,6 +889,11 @@ class Analyzer(override val catalogManager: CatalogManager) u.failAnalysis(s"${ident.quoted} is a temp view. '$cmd' expects a table") } u + case u @ UnresolvedView(ident, _, _) => + lookupTempView(ident).map { _ => + ResolvedView(ident.asIdentifier, isTemp = true) + } + .getOrElse(u) case u @ UnresolvedTableOrView(ident, cmd, allowTempView) => lookupTempView(ident) .map { _ => @@ -1113,6 +1118,14 @@ class Analyzer(override val catalogManager: CatalogManager) case table => table }.getOrElse(u) + case u @ UnresolvedView(identifier, cmd, relationTypeMismatchHint) => + lookupTableOrView(identifier).map { + case v: ResolvedView => v + case _ => + u.failAnalysis(s"${identifier.quoted} is a table. '$cmd' expects a view." 
+ + relationTypeMismatchHint.map(" " + _).getOrElse("")) + }.getOrElse(u) + case u @ UnresolvedTableOrView(identifier, _, _) => lookupTableOrView(identifier).getOrElse(u) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 9f5eefc744135..39cdea2bd4d2a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, TypeUtils} -import org.apache.spark.sql.connector.catalog.{SupportsAtomicPartitionManagement, SupportsPartitionManagement, Table} +import org.apache.spark.sql.connector.catalog.{LookupCatalog, SupportsAtomicPartitionManagement, SupportsPartitionManagement, Table} import org.apache.spark.sql.connector.catalog.TableChange.{AddColumn, After, ColumnPosition, DeleteColumn, RenameColumn, UpdateColumnComment, UpdateColumnNullability, UpdateColumnPosition, UpdateColumnType} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -34,7 +34,7 @@ import org.apache.spark.sql.types._ /** * Throws user facing errors when passed invalid queries that fail to analyze. */ -trait CheckAnalysis extends PredicateHelper { +trait CheckAnalysis extends PredicateHelper with LookupCatalog { protected def isView(nameParts: Seq[String]): Boolean @@ -104,6 +104,15 @@ trait CheckAnalysis extends PredicateHelper { case u: UnresolvedTable => u.failAnalysis(s"Table not found for '${u.commandName}': ${u.multipartIdentifier.quoted}") + case u @ UnresolvedView(NonSessionCatalogAndIdentifier(catalog, ident), cmd, _) => + u.failAnalysis( + s"Cannot specify catalog `${catalog.name}` for view ${ident.quoted} " + + "because view support in v2 catalog has not been implemented yet. 
" + + s"$cmd expects a view.") + + case u: UnresolvedView => + u.failAnalysis(s"View not found for '${u.commandName}': ${u.multipartIdentifier.quoted}") + case u: UnresolvedTableOrView => val viewStr = if (u.allowTempView) "view" else "permanent view" u.failAnalysis( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index 6d89414ba106d..b4dfee1330036 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -187,11 +187,6 @@ class ResolveCatalogs(val catalogManager: CatalogManager) writeOptions = c.writeOptions, orCreate = c.orCreate) - case DropViewStatement(NonSessionCatalogAndTable(catalog, viewName), _) => - throw new AnalysisException( - s"Can not specify catalog `${catalog.name}` for view ${viewName.quoted} " + - s"because view support in catalog has not been implemented yet") - case c @ CreateNamespaceStatement(CatalogAndNamespace(catalog, ns), _, _) if !isSessionCatalog(catalog) => CreateNamespace(catalog.asNamespaceCatalog, ns, c.ifNotExists, c.properties) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveNoopDropTable.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCommandsWithIfExists.scala similarity index 63% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveNoopDropTable.scala rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCommandsWithIfExists.scala index f9da9174f85e6..196a07a7f9904 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveNoopDropTable.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCommandsWithIfExists.scala @@ -17,17 +17,19 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.plans.logical.{DropTable, LogicalPlan, NoopDropTable} +import org.apache.spark.sql.catalyst.plans.logical.{DropTable, DropView, LogicalPlan, NoopCommand} import org.apache.spark.sql.catalyst.rules.Rule /** - * A rule for handling [[DropTable]] logical plan when the table or temp view is not resolved. - * If "ifExists" flag is set to true, the plan is resolved to [[NoopDropTable]], - * which is a no-op command. + * A rule for handling commands when the table or temp view is not resolved. + * These commands support a flag, "ifExists", so that they do not fail when a relation is not + * resolved. If the "ifExists" flag is set to true. 
the plan is resolved to [[NoopCommand]], */ -object ResolveNoopDropTable extends Rule[LogicalPlan] { +object ResolveCommandsWithIfExists extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case DropTable(u: UnresolvedTableOrView, ifExists, _) if ifExists => - NoopDropTable(u.multipartIdentifier) + NoopCommand("DROP TABLE", u.multipartIdentifier) + case DropView(u: UnresolvedView, ifExists) if ifExists => + NoopCommand("DROP VIEW", u.multipartIdentifier) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala index 1518f064d78db..2737b5d58bf42 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala @@ -45,6 +45,19 @@ case class UnresolvedTable( override def output: Seq[Attribute] = Nil } +/** + * Holds the name of a view that has yet to be looked up in a catalog. It will be resolved to + * [[ResolvedView]] during analysis. + */ +case class UnresolvedView( + multipartIdentifier: Seq[String], + commandName: String, + relationTypeMismatchHint: Option[String] = None) extends LeafNode { + override lazy val resolved: Boolean = false + + override def output: Seq[Attribute] = Nil +} + /** * Holds the name of a table or view that has yet to be looked up in a catalog. It will * be resolved to [[ResolvedTable]] or [[ResolvedView]] during analysis. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index b6bd3b77fc874..89b81ec1d83aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3155,11 +3155,14 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Create a [[DropViewStatement]] command. + * Create a [[DropView]] command. */ override def visitDropView(ctx: DropViewContext): AnyRef = withOrigin(ctx) { - DropViewStatement( - visitMultipartIdentifier(ctx.multipartIdentifier()), + DropView( + UnresolvedView( + visitMultipartIdentifier(ctx.multipartIdentifier()), + "DROP VIEW", + Some("Please use DROP TABLE instead.")), ctx.EXISTS != null) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index 402ae657d1709..c4ac8ea8f2e69 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -338,13 +338,6 @@ case class AlterViewAsStatement( originalText: String, query: LogicalPlan) extends ParsedStatement -/** - * A DROP VIEW statement, as parsed from SQL. - */ -case class DropViewStatement( - viewName: Seq[String], - ifExists: Boolean) extends ParsedStatement - /** * An INSERT INTO statement, as parsed from SQL. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 72ba9cf6db0e2..1e17c51137a55 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -419,9 +419,11 @@ case class DropTable( } /** - * The logical plan for handling non-existing table for DROP TABLE command. + * The logical plan for no-op command handling non-existing table. */ -case class NoopDropTable(multipartIdentifier: Seq[String]) extends Command +case class NoopCommand( + commandName: String, + multipartIdentifier: Seq[String]) extends Command /** * The logical plan of the ALTER TABLE command. @@ -724,3 +726,12 @@ case class ShowPartitions( override val output: Seq[Attribute] = Seq( AttributeReference("partition", StringType, nullable = false)()) } + +/** + * The logical plan of the DROP VIEW command. + */ +case class DropView( + child: LogicalPlan, + ifExists: Boolean) extends Command { + override def children: Seq[LogicalPlan] = child :: Nil +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index f925be8617b47..d5b27d9ad25cf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.parser import java.util.Locale import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, GlobalTempView, LocalTempView, PersistedView, UnresolvedAttribute, UnresolvedFunc, UnresolvedNamespace, UnresolvedPartitionSpec, UnresolvedRelation, UnresolvedStar, UnresolvedTable, UnresolvedTableOrView} +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, GlobalTempView, LocalTempView, PersistedView, UnresolvedAttribute, UnresolvedFunc, UnresolvedNamespace, UnresolvedPartitionSpec, UnresolvedRelation, UnresolvedStar, UnresolvedTable, UnresolvedTableOrView, UnresolvedView} import org.apache.spark.sql.catalyst.catalog.{ArchiveResource, BucketSpec, FileResource, FunctionResource, JarResource} import org.apache.spark.sql.catalyst.expressions.{EqualTo, Literal} import org.apache.spark.sql.catalyst.plans.logical._ @@ -721,13 +721,18 @@ class DDLParserSuite extends AnalysisTest { } test("drop view") { + val cmd = "DROP VIEW" + val hint = Some("Please use DROP TABLE instead.") parseCompare(s"DROP VIEW testcat.db.view", - DropViewStatement(Seq("testcat", "db", "view"), ifExists = false)) - parseCompare(s"DROP VIEW db.view", DropViewStatement(Seq("db", "view"), ifExists = false)) + DropView(UnresolvedView(Seq("testcat", "db", "view"), cmd, hint), ifExists = false)) + parseCompare(s"DROP VIEW db.view", + DropView(UnresolvedView(Seq("db", "view"), cmd, hint), ifExists = false)) parseCompare(s"DROP VIEW IF EXISTS db.view", - DropViewStatement(Seq("db", "view"), ifExists = true)) - parseCompare(s"DROP VIEW view", DropViewStatement(Seq("view"), ifExists = false)) - parseCompare(s"DROP VIEW IF EXISTS view", DropViewStatement(Seq("view"), ifExists = true)) + DropView(UnresolvedView(Seq("db", "view"), cmd, hint), ifExists = true)) + parseCompare(s"DROP VIEW view", + DropView(UnresolvedView(Seq("view"), cmd, 
hint), ifExists = false)) + parseCompare(s"DROP VIEW IF EXISTS view", + DropView(UnresolvedView(Seq("view"), cmd, hint), ifExists = true)) } private def testCreateOrReplaceDdl( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 4c7e6fefd9759..657764832a931 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -352,9 +352,8 @@ class ResolveSessionCatalog( } DropTableCommand(r.identifier.asTableIdentifier, ifExists, isView = false, purge = purge) - // v1 DROP TABLE supports temp view. - case DropViewStatement(TempViewOrV1Table(name), ifExists) => - DropTableCommand(name.asTableIdentifier, ifExists, isView = true, purge = false) + case DropView(r: ResolvedView, ifExists) => + DropTableCommand(r.identifier.asTableIdentifier, ifExists, isView = true, purge = false) case c @ CreateNamespaceStatement(CatalogAndNamespace(catalog, ns), _, _) if isSessionCatalog(catalog) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 5f67b39b95c35..7e2a485dcb4cc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -251,7 +251,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat case DropTable(r: ResolvedTable, ifExists, purge) => DropTableExec(r.catalog, r.identifier, ifExists, purge, invalidateCache(r)) :: Nil - case _: NoopDropTable => + case _: NoopCommand => LocalTableScanExec(Nil, Nil) :: Nil case AlterTable(catalog, ident, _, changes) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 2673577aecf36..9a8c3e3cf1a11 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2594,6 +2594,13 @@ class DataSourceV2SQLSuite } } + test("DROP VIEW is not supported for v2 catalogs") { + assertAnalysisError( + "DROP VIEW testcat.v", + "Cannot specify catalog `testcat` for view v because view support in v2 catalog " + + "has not been implemented yet. 
DROP VIEW expects a view.") + } + private def testNotSupportedV2Command( sqlCommand: String, sqlParams: String, @@ -2612,13 +2619,6 @@ class DataSourceV2SQLSuite assert(e.message.contains(s"$sqlCommand is only supported with v1 tables")) } - private def testV1CommandSupportingTempView(sqlCommand: String, sqlParams: String): Unit = { - val e = intercept[AnalysisException] { - sql(s"$sqlCommand $sqlParams") - } - assert(e.message.contains(s"$sqlCommand is only supported with temp views or v1 tables")) - } - private def assertAnalysisError(sqlStatement: String, expectedError: String): Unit = { val errMsg = intercept[AnalysisException] { sql(sqlStatement) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 4f79e71419a10..b3cd9f1057a70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1363,12 +1363,11 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { createDatabase(catalog, "dbx") createTable(catalog, tableIdent) assert(catalog.listTables("dbx") == Seq(tableIdent)) - val e = intercept[AnalysisException] { sql("DROP VIEW dbx.tab1") } - assert( - e.getMessage.contains("Cannot drop a table with DROP VIEW. Please use DROP TABLE instead")) + assert(e.getMessage.contains( + "dbx.tab1 is a table. 'DROP VIEW' expects a view. Please use DROP TABLE instead.")) } protected def testSetProperties(isDatasourceTable: Boolean): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 38719311f1aef..5147a8485ea25 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -78,6 +78,14 @@ class PlanResolutionSuite extends AnalysisTest { V1Table(t) } + private val view: V1Table = { + val t = mock(classOf[CatalogTable]) + when(t.schema).thenReturn(new StructType().add("i", "int").add("s", "string")) + when(t.tableType).thenReturn(CatalogTableType.VIEW) + when(t.provider).thenReturn(Some(v1Format)) + V1Table(t) + } + private val testCat: TableCatalog = { val newCatalog = mock(classOf[TableCatalog]) when(newCatalog.loadTable(any())).thenAnswer((invocation: InvocationOnMock) => { @@ -101,6 +109,7 @@ class PlanResolutionSuite extends AnalysisTest { case "v2Table" => table case "v2Table1" => table case "v2TableWithAcceptAnySchemaCapability" => tableWithAcceptAnySchemaCapability + case "view" => view case name => throw new NoSuchTableException(name) } }) @@ -148,7 +157,10 @@ class PlanResolutionSuite extends AnalysisTest { manager } - def parseAndResolve(query: String, withDefault: Boolean = false): LogicalPlan = { + def parseAndResolve( + query: String, + withDefault: Boolean = false, + checkAnalysis: Boolean = false): LogicalPlan = { val catalogManager = if (withDefault) { catalogManagerWithDefault } else { @@ -158,8 +170,13 @@ class PlanResolutionSuite extends AnalysisTest { override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = Seq( new ResolveSessionCatalog(catalogManager, _ == Seq("v"), _ => false)) } - // We don't check analysis here, as we expect the plan to be unresolved such as `CreateTable`. 
- analyzer.execute(CatalystSqlParser.parsePlan(query)) + // We don't check analysis here by default, as we expect the plan to be unresolved + // such as `CreateTable`. + val analyzed = analyzer.execute(CatalystSqlParser.parsePlan(query)) + if (checkAnalysis) { + analyzer.checkAnalysis(analyzed) + } + analyzed } private def parseResolveCompare(query: String, expected: LogicalPlan): Unit = @@ -677,6 +694,8 @@ class PlanResolutionSuite extends AnalysisTest { val viewIdent1 = TableIdentifier("view", Option("db")) val viewName2 = "view" val viewIdent2 = TableIdentifier("view", Option("default")) + val tempViewName = "v" + val tempViewIdent = TableIdentifier("v") parseResolveCompare(s"DROP VIEW $viewName1", DropTableCommand(viewIdent1, ifExists = false, isView = true, purge = false)) @@ -686,11 +705,15 @@ class PlanResolutionSuite extends AnalysisTest { DropTableCommand(viewIdent2, ifExists = false, isView = true, purge = false)) parseResolveCompare(s"DROP VIEW IF EXISTS $viewName2", DropTableCommand(viewIdent2, ifExists = true, isView = true, purge = false)) + parseResolveCompare(s"DROP VIEW $tempViewName", + DropTableCommand(tempViewIdent, ifExists = false, isView = true, purge = false)) + parseResolveCompare(s"DROP VIEW IF EXISTS $tempViewName", + DropTableCommand(tempViewIdent, ifExists = true, isView = true, purge = false)) } test("drop view in v2 catalog") { intercept[AnalysisException] { - parseAndResolve("DROP VIEW testcat.db.view") + parseAndResolve("DROP VIEW testcat.db.view", checkAnalysis = true) }.getMessage.toLowerCase(Locale.ROOT).contains( "view support in catalog has not been implemented") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index ce31e39985971..d6a4d76386889 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -1048,7 +1048,8 @@ class HiveDDLSuite val message = intercept[AnalysisException] { sql("DROP VIEW tab1") }.getMessage - assert(message.contains("Cannot drop a table with DROP VIEW. Please use DROP TABLE instead")) + assert(message.contains( + "tab1 is a table. 'DROP VIEW' expects a view. Please use DROP TABLE instead.")) } } From a093d6feefb0e086d19c86ae53bf92df12ccf2fa Mon Sep 17 00:00:00 2001 From: Josh Soref Date: Tue, 8 Dec 2020 08:57:13 -0600 Subject: [PATCH 145/150] [MINOR] Spelling sql/core ### What changes were proposed in this pull request? This PR intends to fix typos in the sub-modules: * `sql/core` Split per srowen https://github.com/apache/spark/pull/30323#issuecomment-728981618 NOTE: The misspellings have been reported at https://github.com/jsoref/spark/commit/706a726f87a0bbf5e31467fae9015218773db85b#commitcomment-44064356 ### Why are the changes needed? Misspelled words make it harder to read / understand content. ### Does this PR introduce _any_ user-facing change? There are various fixes to documentation, etc... ### How was this patch tested? No testing was performed Closes #30531 from jsoref/spelling-sql-core. 
Authored-by: Josh Soref Signed-off-by: Sean Owen --- .../sql/execution/ui/static/spark-sql-viz.js | 8 ++--- .../scala/org/apache/spark/sql/Dataset.scala | 10 +++--- .../sql/execution/DataSourceScanExec.scala | 6 ++-- .../spark/sql/execution/ExplainUtils.scala | 8 ++--- .../ExternalAppendOnlyUnsafeRowArray.scala | 2 +- .../spark/sql/execution/SparkSqlParser.scala | 14 ++++---- .../sql/execution/WholeStageCodegenExec.scala | 2 +- .../adaptive/AdaptiveSparkPlanHelper.scala | 2 +- .../InsertIntoDataSourceDirCommand.scala | 2 +- .../spark/sql/execution/command/ddl.scala | 4 +-- .../spark/sql/execution/command/tables.scala | 2 +- .../execution/datasources/DataSource.scala | 2 +- .../datasources/FileFormatDataWriter.scala | 14 ++++---- .../datasources/FileFormatWriter.scala | 2 +- .../datasources/PartitioningUtils.scala | 2 +- .../v2/WriteToDataSourceV2Exec.scala | 2 +- .../sql/execution/joins/HashedRelation.scala | 4 +-- .../execution/python/ExtractPythonUDFs.scala | 6 ++-- .../streaming/CompactibleFileStreamLog.scala | 2 +- .../execution/streaming/StreamExecution.scala | 2 +- .../FlatMapGroupsWithStateExecHelper.scala | 2 +- .../apache/spark/sql/internal/HiveSerDe.scala | 2 +- .../sql/streaming/DataStreamWriter.scala | 4 +-- .../sql/Java8DatasetAggregatorSuite.java | 16 +++++----- .../spark/sql/JavaDatasetAggregatorSuite.java | 24 +++++++------- .../ansi/decimalArithmeticOperations.sql | 2 +- .../inputs/postgreSQL/create_view.sql | 2 +- .../apache/spark/sql/CachedTableSuite.scala | 8 ++--- .../org/apache/spark/sql/DataFrameSuite.scala | 2 +- .../apache/spark/sql/DatasetCacheSuite.scala | 13 ++++---- .../spark/sql/DatasetPrimitiveSuite.scala | 8 ++--- .../org/apache/spark/sql/DatasetSuite.scala | 32 +++++++++---------- .../apache/spark/sql/DateFunctionsSuite.scala | 6 ++-- .../org/apache/spark/sql/SQLQuerySuite.scala | 6 ++-- .../apache/spark/sql/SQLQueryTestSuite.scala | 10 +++--- .../sql/SparkSessionExtensionSuite.scala | 18 +++++------ .../apache/spark/sql/TPCDSTableStats.scala | 2 +- .../sql/connector/DataSourceV2SQLSuite.scala | 12 +++---- .../execution/SQLWindowFunctionSuite.scala | 2 +- .../sql/execution/SparkSqlParserSuite.scala | 2 +- .../execution/WholeStageCodegenSuite.scala | 4 +-- .../adaptive/AdaptiveQueryExecSuite.scala | 8 ++--- .../arrow/ArrowConvertersSuite.scala | 2 +- .../sql/execution/command/DDLSuite.scala | 12 +++---- .../command/PlanResolutionSuite.scala | 16 +++++----- .../datasources/DataSourceSuite.scala | 4 +-- .../datasources/SchemaPruningSuite.scala | 8 ++--- .../ParquetInteroperabilitySuite.scala | 2 +- .../ParquetPartitionDiscoverySuite.scala | 4 +-- .../parquet/ParquetQuerySuite.scala | 4 +-- .../exchange/EnsureRequirementsSuite.scala | 2 +- .../execution/metric/SQLMetricsSuite.scala | 2 +- .../streaming/HDFSMetadataLogSuite.scala | 2 +- .../sql/execution/ui/SparkPlanInfoSuite.scala | 6 ++-- .../internal/ExecutorSideSQLConfSuite.scala | 4 +-- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 12 +++---- .../spark/sql/sources/BucketedReadSuite.scala | 18 ++++++----- .../sources/CreateTableAsSelectSuite.scala | 2 +- .../spark/sql/sources/TableScanSuite.scala | 6 ++-- .../sql/streaming/FileStreamSourceSuite.scala | 4 +-- .../spark/sql/streaming/StreamSuite.scala | 8 ++--- .../test/DataStreamTableAPISuite.scala | 8 ++--- .../apache/spark/sql/test/SQLTestData.scala | 4 +-- .../spark/sql/test/SharedSparkSession.scala | 2 +- 64 files changed, 208 insertions(+), 205 deletions(-) diff --git 
a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js index 301183f749a84..d1def1b0a42ff 100644 --- a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js +++ b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.js @@ -87,14 +87,14 @@ function preprocessGraphLayout(g) { var node = g.node(nodes[i]); node.padding = "5"; - var firstSearator; + var firstSeparator; var secondSeparator; var splitter; if (node.isCluster) { - firstSearator = secondSeparator = labelSeparator; + firstSeparator = secondSeparator = labelSeparator; splitter = "\\n"; } else { - firstSearator = ""; + firstSeparator = ""; secondSeparator = ""; splitter = "
      "; } @@ -104,7 +104,7 @@ function preprocessGraphLayout(g) { if (newTexts) { node.label = node.label.replace( newTexts[0], - newTexts[1] + firstSearator + newTexts[2] + secondSeparator + newTexts[3]); + newTexts[1] + firstSeparator + newTexts[2] + secondSeparator + newTexts[3]); } }); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 05d6647afd958..6afbbce3ff8d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1363,7 +1363,7 @@ class Dataset[T] private[sql]( // Attach the dataset id and column position to the column reference, so that we can detect // ambiguous self-join correctly. See the rule `DetectAmbiguousSelfJoin`. // This must be called before we return a `Column` that contains `AttributeReference`. - // Note that, the metadata added here are only avaiable in the analyzer, as the analyzer rule + // Note that, the metadata added here are only available in the analyzer, as the analyzer rule // `DetectAmbiguousSelfJoin` will remove it. private def addDataFrameIdToCol(expr: NamedExpression): NamedExpression = { val newExpr = expr transform { @@ -1665,10 +1665,10 @@ class Dataset[T] private[sql]( * See [[RelationalGroupedDataset]] for all the available aggregate functions. * * {{{ - * // Compute the average for all numeric columns rolluped by department and group. + * // Compute the average for all numeric columns rolled up by department and group. * ds.rollup($"department", $"group").avg() * - * // Compute the max age and average salary, rolluped by department and gender. + * // Compute the max age and average salary, rolled up by department and gender. * ds.rollup($"department", $"gender").agg(Map( * "salary" -> "avg", * "age" -> "max" @@ -1794,10 +1794,10 @@ class Dataset[T] private[sql]( * (i.e. cannot construct expressions). * * {{{ - * // Compute the average for all numeric columns rolluped by department and group. + * // Compute the average for all numeric columns rolled up by department and group. * ds.rollup("department", "group").avg() * - * // Compute the max age and average salary, rolluped by department and gender. + * // Compute the max age and average salary, rolled up by department and gender. * ds.rollup($"department", $"gender").agg(Map( * "salary" -> "avg", * "age" -> "max" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 44636beeec7fc..df3b9f2a4e9cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -284,7 +284,7 @@ case class FileSourceScanExec( // // Sort ordering would be over the prefix subset of `sort columns` being read // from the table. - // eg. + // e.g. 
// Assume (col0, col2, col3) are the columns read from the table // If sort columns are (col0, col1), then sort ordering would be considered as (col0) // If sort columns are (col1, col0), then sort ordering would be empty as per rule #2 @@ -379,12 +379,12 @@ case class FileSourceScanExec( case (key, _) if (key.equals("Location")) => val location = relation.location val numPaths = location.rootPaths.length - val abbreviatedLoaction = if (numPaths <= 1) { + val abbreviatedLocation = if (numPaths <= 1) { location.rootPaths.mkString("[", ", ", "]") } else { "[" + location.rootPaths.head + s", ... ${numPaths - 1} entries]" } - s"$key: ${location.getClass.getSimpleName} ${redact(abbreviatedLoaction)}" + s"$key: ${location.getClass.getSimpleName} ${redact(abbreviatedLocation)}" case (key, value) => s"$key: ${redact(value)}" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala index b54bd6a579b66..20e6fb6f96eaa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala @@ -28,14 +28,14 @@ import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveS object ExplainUtils extends AdaptiveSparkPlanHelper { /** * Given a input physical plan, performs the following tasks. - * 1. Computes the operator id for current operator and records it in the operaror + * 1. Computes the operator id for current operator and records it in the operator * by setting a tag. * 2. Computes the whole stage codegen id for current operator and records it in the * operator by setting a tag. * 3. Generate the two part explain output for this plan. * 1. First part explains the operator tree with each operator tagged with an unique * identifier. - * 2. Second part explans each operator in a verbose manner. + * 2. Second part explains each operator in a verbose manner. * * Note : This function skips over subqueries. They are handled by its caller. * @@ -117,7 +117,7 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { } /** - * Traverses the supplied input plan in a bottem-up fashion does the following : + * Traverses the supplied input plan in a bottom-up fashion does the following : * 1. produces a map : operator identifier -> operator * 2. Records the operator id via setting a tag in the operator. * Note : @@ -210,7 +210,7 @@ object ExplainUtils extends AdaptiveSparkPlanHelper { /** * Given a input plan, returns an array of tuples comprising of : - * 1. Hosting opeator id. + * 1. Hosting operator id. * 2. Hosting expression * 3. Subquery plan */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala index 993627847c08c..c5e5de588ba9d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExternalAppendOnlyUnsafeRowArray.scala @@ -87,7 +87,7 @@ private[sql] class ExternalAppendOnlyUnsafeRowArray( def isEmpty: Boolean = numRows == 0 /** - * Clears up resources (eg. memory) held by the backing storage + * Clears up resources (e.g. 
memory) held by the backing storage */ def clear(): Unit = { if (spillableArray != null) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index c82e3818b48cc..7a31b0dcdd43d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -386,25 +386,25 @@ class SparkSqlAstBuilder extends AstBuilder { * - '/path/to/fileOrJar' */ override def visitManageResource(ctx: ManageResourceContext): LogicalPlan = withOrigin(ctx) { - val mayebePaths = if (ctx.STRING != null) string(ctx.STRING) else remainder(ctx.identifier).trim + val maybePaths = if (ctx.STRING != null) string(ctx.STRING) else remainder(ctx.identifier).trim ctx.op.getType match { case SqlBaseParser.ADD => ctx.identifier.getText.toLowerCase(Locale.ROOT) match { - case "file" => AddFileCommand(mayebePaths) - case "jar" => AddJarCommand(mayebePaths) + case "file" => AddFileCommand(maybePaths) + case "jar" => AddJarCommand(maybePaths) case other => operationNotAllowed(s"ADD with resource type '$other'", ctx) } case SqlBaseParser.LIST => ctx.identifier.getText.toLowerCase(Locale.ROOT) match { case "files" | "file" => - if (mayebePaths.length > 0) { - ListFilesCommand(mayebePaths.split("\\s+")) + if (maybePaths.length > 0) { + ListFilesCommand(maybePaths.split("\\s+")) } else { ListFilesCommand() } case "jars" | "jar" => - if (mayebePaths.length > 0) { - ListJarsCommand(mayebePaths.split("\\s+")) + if (maybePaths.length > 0) { + ListJarsCommand(maybePaths.split("\\s+")) } else { ListJarsCommand() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index b2963457e22db..c6ea99cfdad7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -670,7 +670,7 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) } ${ctx.registerComment( - s"""Codegend pipeline for stage (id=$codegenStageId) + s"""Codegened pipeline for stage (id=$codegenStageId) |${this.treeString.trim}""".stripMargin, "wsc_codegenPipeline")} ${ctx.registerComment(s"codegenStageId=$codegenStageId", "wsc_codegenStageId", true)} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala index 6ba375910a4eb..eecfa40e8d0bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala @@ -115,7 +115,7 @@ trait AdaptiveSparkPlanHelper { /** * Returns a sequence containing the subqueries in this plan, also including the (nested) - * subquries in its children + * subqueries in its children */ def subqueriesAll(p: SparkPlan): Seq[SparkPlan] = { val subqueries = flatMap(p)(_.subqueries) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala index 08d31fdda2dc8..d065bc0dab4cd 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.execution.datasources._ * @param storage storage format used to describe how the query result is stored. * @param provider the data source type to be used * @param query the logical plan representing data to write to - * @param overwrite whthere overwrites existing directory + * @param overwrite whether overwrites existing directory */ case class InsertIntoDataSourceDirCommand( storage: CatalogStorageFormat, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 69425cfed285f..6d631e044e917 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -89,8 +89,8 @@ case class CreateDatabaseCommand( * A command for users to remove a database from the system. * * 'ifExists': - * - true, if database_name does't exist, no action - * - false (default), if database_name does't exist, a warning message will be issued + * - true, if database_name doesn't exist, no action + * - false (default), if database_name doesn't exist, a warning message will be issued * 'cascade': * - true, the dependent objects are automatically dropped before dropping database. * - false (default), it is in the Restrict mode. The database cannot be dropped if diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 640051384e94c..431a103063c68 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -352,7 +352,7 @@ case class LoadDataCommand( // entire string will be considered while making a Path instance,this is mainly done // by considering the wild card scenario in mind.as per old logic query param is // been considered while creating URI instance and if path contains wild card char '?' - // the remaining charecters after '?' will be removed while forming URI instance + // the remaining characters after '?' will be removed while forming URI instance LoadDataCommand.makeQualified(defaultFS, uriPath, loadPath) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 34ded5d456d09..4783789b91f3e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -211,7 +211,7 @@ case class DataSource( s"Unable to infer schema for $format. It must be specified manually.") } - // We just print a waring message if the data schema and partition schema have the duplicate + // We just print a warning message if the data schema and partition schema have the duplicate // columns. This is because we allow users to do so in the previous Spark releases and // we have the existing tests for the cases (e.g., `ParquetHadoopFsRelationSuite`). // See SPARK-18108 and SPARK-21144 for related discussions. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala index edb49d3f90ca3..6de9b1d7cea4b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala @@ -167,7 +167,7 @@ class DynamicPartitionDataWriter( private var fileCounter: Int = _ private var recordsInFile: Long = _ - private var currentPartionValues: Option[UnsafeRow] = None + private var currentPartitionValues: Option[UnsafeRow] = None private var currentBucketId: Option[Int] = None /** Extracts the partition values out of an input row. */ @@ -247,11 +247,11 @@ class DynamicPartitionDataWriter( val nextPartitionValues = if (isPartitioned) Some(getPartitionValues(record)) else None val nextBucketId = if (isBucketed) Some(getBucketId(record)) else None - if (currentPartionValues != nextPartitionValues || currentBucketId != nextBucketId) { + if (currentPartitionValues != nextPartitionValues || currentBucketId != nextBucketId) { // See a new partition or bucket - write to a new partition dir (or a new bucket file). - if (isPartitioned && currentPartionValues != nextPartitionValues) { - currentPartionValues = Some(nextPartitionValues.get.copy()) - statsTrackers.foreach(_.newPartition(currentPartionValues.get)) + if (isPartitioned && currentPartitionValues != nextPartitionValues) { + currentPartitionValues = Some(nextPartitionValues.get.copy()) + statsTrackers.foreach(_.newPartition(currentPartitionValues.get)) } if (isBucketed) { currentBucketId = nextBucketId @@ -259,7 +259,7 @@ class DynamicPartitionDataWriter( } fileCounter = 0 - newOutputWriter(currentPartionValues, currentBucketId) + newOutputWriter(currentPartitionValues, currentBucketId) } else if (description.maxRecordsPerFile > 0 && recordsInFile >= description.maxRecordsPerFile) { // Exceeded the threshold in terms of the number of records per file. @@ -268,7 +268,7 @@ class DynamicPartitionDataWriter( assert(fileCounter < MAX_FILE_COUNTER, s"File counter $fileCounter is beyond max value $MAX_FILE_COUNTER") - newOutputWriter(currentPartionValues, currentBucketId) + newOutputWriter(currentPartitionValues, currentBucketId) } val outputRow = getOutputRow(record) currentWriter.write(outputRow) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index a71aeb47872ce..48ebd6f0c610f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -164,7 +164,7 @@ object FileFormatWriter extends Logging { SQLExecution.checkSQLExecutionId(sparkSession) - // propagate the decription UUID into the jobs, so that committers + // propagate the description UUID into the jobs, so that committers // get an ID guaranteed to be unique. 
job.getConfiguration.set("spark.sql.sources.writeJobUUID", description.uuid) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index ea437d200eaab..69123ee7af5b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -453,7 +453,7 @@ object PartitioningUtils { val decimalTry = Try { // `BigDecimal` conversion can fail when the `field` is not a form of number. val bigDecimal = new JBigDecimal(raw) - // It reduces the cases for decimals by disallowing values having scale (eg. `1.1`). + // It reduces the cases for decimals by disallowing values having scale (e.g. `1.1`). require(bigDecimal.scale <= 0) // `DecimalType` conversion can fail when // 1. The precision is bigger than 38. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index 47aad2bcb2c56..f5f77d38b8716 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -168,7 +168,7 @@ case class ReplaceTableAsSelectExec( * A new table will be created using the schema of the query, and rows from the query are appended. * If the table exists, its contents and schema should be replaced with the schema and the contents * of the query. This implementation is atomic. The table replacement is staged, and the commit - * operation at the end should perform tne replacement of the table's metadata and contents. If the + * operation at the end should perform the replacement of the table's metadata and contents. If the * write fails, the table is instructed to roll back staged changes and any previously written table * is left untouched. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 3c5ed40551206..a91cc0782e1f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -426,9 +426,9 @@ private[joins] class UnsafeHashedRelation( readBuffer(valuesBuffer, 0, valuesSize) val loc = binaryMap.lookup(keyBuffer, Platform.BYTE_ARRAY_OFFSET, keySize) - val putSuceeded = loc.append(keyBuffer, Platform.BYTE_ARRAY_OFFSET, keySize, + val putSucceeded = loc.append(keyBuffer, Platform.BYTE_ARRAY_OFFSET, keySize, valuesBuffer, Platform.BYTE_ARRAY_OFFSET, valuesSize) - if (!putSuceeded) { + if (!putSucceeded) { binaryMap.free() throw new IOException("Could not allocate memory to grow BytesToBytesMap") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index dab2723d25726..b79bcd176b7b7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -102,7 +102,7 @@ object ExtractGroupingPythonUDFFromAggregate extends Rule[LogicalPlan] { case p: PythonUDF => // This is just a sanity check, the rule PullOutNondeterministic should // already pull out those nondeterministic expressions. - assert(p.udfDeterministic, "Non-determinstic PythonUDFs should not appear " + + assert(p.udfDeterministic, "Non-deterministic PythonUDFs should not appear " + "in grouping expression") val canonicalized = p.canonicalized.asInstanceOf[PythonUDF] if (attributeMap.contains(canonicalized)) { @@ -174,7 +174,7 @@ object ExtractPythonUDFs extends Rule[LogicalPlan] with PredicateHelper { } private def collectEvaluableUDFsFromExpressions(expressions: Seq[Expression]): Seq[PythonUDF] = { - // If fisrt UDF is SQL_SCALAR_PANDAS_ITER_UDF, then only return this UDF, + // If first UDF is SQL_SCALAR_PANDAS_ITER_UDF, then only return this UDF, // otherwise check if subsequent UDFs are of the same type as the first UDF. (since we can only // extract UDFs of the same eval type) @@ -268,7 +268,7 @@ object ExtractPythonUDFs extends Rule[LogicalPlan] with PredicateHelper { case PythonEvalType.SQL_SCALAR_PANDAS_UDF | PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF => ArrowEvalPython(validUdfs, resultAttrs, child, evalType) case _ => - throw new AnalysisException("Unexcepted UDF evalType") + throw new AnalysisException("Unexpected UDF evalType") } attributeMap ++= validUdfs.map(canonicalizeDeterministic).zip(resultAttrs) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala index 3c76306f20cd7..835c7c4d5261f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala @@ -288,7 +288,7 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag]( /** * Delete expired log entries that proceed the currentBatchId and retain - * sufficient minimum number of batches (given by minBatchsToRetain). 
This + * sufficient minimum number of batches (given by minBatchesToRetain). This * equates to retaining the earliest compaction log that proceeds * batch id position currentBatchId + 1 - minBatchesToRetain. All log entries * prior to the earliest compaction log proceeding that position will be removed. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index d6be33c76e937..6b0d33b819a20 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -685,6 +685,6 @@ object StreamExecution { /** * A special thread to run the stream query. Some codes require to run in the QueryExecutionThread - * and will use `classOf[QueryxecutionThread]` to check. + * and will use `classOf[QueryExecutionThread]` to check. */ abstract class QueryExecutionThread(name: String) extends UninterruptibleThread(name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelper.scala index 0a16a3819b778..cc785ee4247c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelper.scala @@ -77,7 +77,7 @@ object FlatMapGroupsWithStateExecHelper { // =========================== Private implementations of StateManager =========================== // =============================================================================================== - /** Commmon methods for StateManager implementations */ + /** Common methods for StateManager implementations */ private abstract class StateManagerImplBase(shouldStoreTimestamp: Boolean) extends StateManager { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index 64b7e7fe7923a..cfcfeabbf1f6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -65,7 +65,7 @@ object HiveSerDe { outputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"), serde = Option("org.apache.hadoop.hive.serde2.avro.AvroSerDe"))) - // `HiveSerDe` in `serdeMap` should be dintinct. + // `HiveSerDe` in `serdeMap` should be distinct. 
val serdeInverseMap: Map[HiveSerDe, String] = serdeMap.flatMap { case ("sequencefile", _) => None case ("rcfile", _) => None diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 01e626e5436a4..9e8dff37bcfd2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -387,8 +387,8 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { } val sink = new MemorySink() val resultDf = Dataset.ofRows(df.sparkSession, new MemoryPlan(sink, df.schema.toAttributes)) - val recoverFromChkpoint = outputMode == OutputMode.Complete() - val query = startQuery(sink, extraOptions, recoverFromCheckpoint = recoverFromChkpoint) + val recoverFromCheckpoint = outputMode == OutputMode.Complete() + val query = startQuery(sink, extraOptions, recoverFromCheckpoint = recoverFromCheckpoint) resultDf.createOrReplaceTempView(query.name) query } else if (source == SOURCE_NAME_FOREACH) { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java index dd3755d3f904e..de88f80eb53b8 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java @@ -34,43 +34,43 @@ public class Java8DatasetAggregatorSuite extends JavaDatasetAggregatorSuiteBase @Test public void testTypedAggregationAverage() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.avg(v -> (double)(v._2() * 2))); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 6.0)), - agged.collectAsList()); + aggregated.collectAsList()); } @SuppressWarnings("deprecation") @Test public void testTypedAggregationCount() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.count(v -> v)); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 2L), new Tuple2<>("b", 1L)), - agged.collectAsList()); + aggregated.collectAsList()); } @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumDouble() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.sum(v -> (double)v._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 3.0)), - agged.collectAsList()); + aggregated.collectAsList()); } @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumLong() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.sumLong(v -> (long)v._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3L), new Tuple2<>("b", 3L)), - agged.collectAsList()); + aggregated.collectAsList()); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java index 8a90624f2070b..979b7751fa9a8 100644 --- 
a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java @@ -38,18 +38,18 @@ public class JavaDatasetAggregatorSuite extends JavaDatasetAggregatorSuiteBase { public void testTypedAggregationAnonClass() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(new IntSumOf().toColumn()); + Dataset> aggregated = grouped.agg(new IntSumOf().toColumn()); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3), new Tuple2<>("b", 3)), - agged.collectAsList()); + aggregated.collectAsList()); - Dataset> agged2 = grouped.agg(new IntSumOf().toColumn()) + Dataset> aggregated2 = grouped.agg(new IntSumOf().toColumn()) .as(Encoders.tuple(Encoders.STRING(), Encoders.INT())); Assert.assertEquals( Arrays.asList( new Tuple2<>("a", 3), new Tuple2<>("b", 3)), - agged2.collectAsList()); + aggregated2.collectAsList()); } static class IntSumOf extends Aggregator, Integer, Integer> { @@ -88,43 +88,43 @@ public Encoder outputEncoder() { @Test public void testTypedAggregationAverage() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.avg(value -> value._2() * 2.0)); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 6.0)), - agged.collectAsList()); + aggregated.collectAsList()); } @SuppressWarnings("deprecation") @Test public void testTypedAggregationCount() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.count(value -> value)); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 2L), new Tuple2<>("b", 1L)), - agged.collectAsList()); + aggregated.collectAsList()); } @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumDouble() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.sum(value -> (double) value._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 3.0)), - agged.collectAsList()); + aggregated.collectAsList()); } @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumLong() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg( + Dataset> aggregated = grouped.agg( org.apache.spark.sql.expressions.javalang.typed.sumLong(value -> (long) value._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3L), new Tuple2<>("b", 3L)), - agged.collectAsList()); + aggregated.collectAsList()); } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql index d190f38345d6b..d843847e6a149 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql @@ -1,6 +1,6 @@ -- SPARK-23179: SQL ANSI 2011 states that in case of overflow during arithmetic operations, -- an exception should be thrown instead of returning NULL. --- This is what most of the SQL DBs do (eg. SQLServer, DB2). +-- This is what most of the SQL DBs do (e.g. SQLServer, DB2). 
-- tests for decimals handling in operations create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql index 21ffd85f7d01f..2889941c1fcc1 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql @@ -636,7 +636,7 @@ DESC TABLE vv6; -- Check cases involving dropped/altered columns in a function's rowtype result -- --- Skip the tests below because Spark does't support PostgreSQL-specific UDFs/transactions +-- Skip the tests below because Spark doesn't support PostgreSQL-specific UDFs/transactions -- create table tt14t (f1 text, f2 text, f3 text, f4 text); -- insert into tt14t values('foo', 'bar', 'baz', '42'); -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index d0150616cd67e..3765093f83bc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -835,7 +835,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } } - test("SPARK-19993 nested subquery caching and scalar + predicate subqueris") { + test("SPARK-19993 nested subquery caching and scalar + predicate subqueries") { withTempView("t1", "t2", "t3", "t4") { Seq(1).toDF("c1").createOrReplaceTempView("t1") Seq(2).toDF("c1").createOrReplaceTempView("t2") @@ -886,17 +886,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils } private def checkIfNoJobTriggered[T](f: => T): T = { - var numJobTrigered = 0 + var numJobTriggered = 0 val jobListener = new SparkListener { override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - numJobTrigered += 1 + numJobTriggered += 1 } } sparkContext.addSparkListener(jobListener) try { val result = f sparkContext.listenerBus.waitUntilEmpty() - assert(numJobTrigered === 0) + assert(numJobTriggered === 0) result } finally { sparkContext.removeSparkListener(jobListener) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index a45bf12e8f841..4fecd625031ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -805,7 +805,7 @@ class DataFrameSuite extends QueryTest assert(df2.drop("`a.b`").columns.size == 2) } - test("drop(name: String) search and drop all top level columns that matchs the name") { + test("drop(name: String) search and drop all top level columns that matches the name") { val df1 = Seq((1, 2)).toDF("a", "b") val df2 = Seq((3, 4)).toDF("a", "b") checkAnswer(df1.crossJoin(df2), Row(1, 2, 3, 4)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala index 5c144dad23c30..009ccb9a45354 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala @@ -102,18 +102,19 @@ class DatasetCacheSuite extends QueryTest test("persist and then groupBy columns asKey, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(_._1) - val agged = grouped.mapGroups { 
(g, iter) => (g, iter.map(_._2).sum) } - agged.persist() + val aggregated = grouped.mapGroups { (g, iter) => (g, iter.map(_._2).sum) } + aggregated.persist() checkDataset( - agged.filter(_._1 == "b"), + aggregated.filter(_._1 == "b"), ("b", 3)) - assertCached(agged.filter(_._1 == "b")) + assertCached(aggregated.filter(_._1 == "b")) ds.unpersist(blocking = true) assert(ds.storageLevel == StorageLevel.NONE, "The Dataset ds should not be cached.") - agged.unpersist(blocking = true) - assert(agged.storageLevel == StorageLevel.NONE, "The Dataset agged should not be cached.") + aggregated.unpersist(blocking = true) + assert(aggregated.storageLevel == StorageLevel.NONE, + "The Dataset aggregated should not be cached.") } test("persist and then withColumn") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala index ac51634febc99..8547d96e0f457 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala @@ -170,23 +170,23 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSparkSession { test("groupBy function, map") { val ds = Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11).toDS() val grouped = ds.groupByKey(_ % 2) - val agged = grouped.mapGroups { (g, iter) => + val aggregated = grouped.mapGroups { (g, iter) => val name = if (g == 0) "even" else "odd" (name, iter.size) } checkDatasetUnorderly( - agged, + aggregated, ("even", 5), ("odd", 6)) } test("groupBy function, flatMap") { val ds = Seq("a", "b", "c", "xyz", "hello").toDS() val grouped = ds.groupByKey(_.length) - val agged = grouped.flatMapGroups { (g, iter) => Iterator(g.toString, iter.mkString) } + val aggregated = grouped.flatMapGroups { (g, iter) => Iterator(g.toString, iter.mkString) } checkDatasetUnorderly( - agged, + aggregated, "1", "abc", "3", "xyz", "5", "hello") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 953a58760cd5c..67e3ad6a80642 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -528,42 +528,42 @@ class DatasetSuite extends QueryTest test("groupBy function, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(v => (v._1, "word")) - val agged = grouped.mapGroups { (g, iter) => (g._1, iter.map(_._2).sum) } + val aggregated = grouped.mapGroups { (g, iter) => (g._1, iter.map(_._2).sum) } checkDatasetUnorderly( - agged, + aggregated, ("a", 30), ("b", 3), ("c", 1)) } test("groupBy function, flatMap") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(v => (v._1, "word")) - val agged = grouped.flatMapGroups { (g, iter) => + val aggregated = grouped.flatMapGroups { (g, iter) => Iterator(g._1, iter.map(_._2).sum.toString) } checkDatasetUnorderly( - agged, + aggregated, "a", "30", "b", "3", "c", "1") } test("groupBy function, mapValues, flatMap") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val keyValue = ds.groupByKey(_._1).mapValues(_._2) - val agged = keyValue.mapGroups { (g, iter) => (g, iter.sum) } - checkDataset(agged, ("a", 30), ("b", 3), ("c", 1)) + val aggregated = keyValue.mapGroups { (g, iter) => (g, iter.sum) } + checkDataset(aggregated, ("a", 30), ("b", 3), ("c", 1)) val 
keyValue1 = ds.groupByKey(t => (t._1, "key")).mapValues(t => (t._2, "value")) - val agged1 = keyValue1.mapGroups { (g, iter) => (g._1, iter.map(_._1).sum) } - checkDataset(agged1, ("a", 30), ("b", 3), ("c", 1)) + val aggregated1 = keyValue1.mapGroups { (g, iter) => (g._1, iter.map(_._1).sum) } + checkDataset(aggregated1, ("a", 30), ("b", 3), ("c", 1)) } test("groupBy function, reduce") { val ds = Seq("abc", "xyz", "hello").toDS() - val agged = ds.groupByKey(_.length).reduceGroups(_ + _) + val aggregated = ds.groupByKey(_.length).reduceGroups(_ + _) checkDatasetUnorderly( - agged, + aggregated, 3 -> "abcxyz", 5 -> "hello") } @@ -914,11 +914,11 @@ class DatasetSuite extends QueryTest test("grouping key and grouped value has field with same name") { val ds = Seq(ClassData("a", 1), ClassData("a", 2)).toDS() - val agged = ds.groupByKey(d => ClassNullableData(d.a, null)).mapGroups { + val aggregated = ds.groupByKey(d => ClassNullableData(d.a, null)).mapGroups { (key, values) => key.a + values.map(_.b).sum } - checkDataset(agged, "a3") + checkDataset(aggregated, "a3") } test("cogroup's left and right side has field with same name") { @@ -1286,7 +1286,7 @@ class DatasetSuite extends QueryTest Route("b", "c", 6)) val ds = sparkContext.parallelize(data).toDF.as[Route] - val grped = ds.map(r => GroupedRoutes(r.src, r.dest, Seq(r))) + val grouped = ds.map(r => GroupedRoutes(r.src, r.dest, Seq(r))) .groupByKey(r => (r.src, r.dest)) .reduceGroups { (g1: GroupedRoutes, g2: GroupedRoutes) => GroupedRoutes(g1.src, g1.dest, g1.routes ++ g2.routes) @@ -1303,7 +1303,7 @@ class DatasetSuite extends QueryTest implicit def ordering[GroupedRoutes]: Ordering[GroupedRoutes] = (x: GroupedRoutes, y: GroupedRoutes) => x.toString.compareTo(y.toString) - checkDatasetUnorderly(grped, expected: _*) + checkDatasetUnorderly(grouped, expected: _*) } test("SPARK-18189: Fix serialization issue in KeyValueGroupedDataset") { @@ -1383,7 +1383,7 @@ class DatasetSuite extends QueryTest } } } else { - // Local checkpoints dont require checkpoint_dir + // Local checkpoints don't require checkpoint_dir f } } @@ -1474,7 +1474,7 @@ class DatasetSuite extends QueryTest } test("SPARK-18717: code generation works for both scala.collection.Map" + - " and scala.collection.imutable.Map") { + " and scala.collection.immutable.Map") { val ds = Seq(WithImmutableMap("hi", Map(42L -> "foo"))).toDS checkDataset(ds.map(t => t), WithImmutableMap("hi", Map(42L -> "foo"))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 9caa4c0377009..d7bbf597ff983 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -454,7 +454,7 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { assert(e.getCause.isInstanceOf[IllegalArgumentException]) assert(e.getMessage.contains("You may get a different result due to the upgrading of Spark")) - // february + // February val x1 = "2016-02-29" val x2 = "2017-02-29" val df1 = Seq(x1, x2).toDF("x") @@ -629,7 +629,7 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { e.getMessage.contains("You may get a different result due to the upgrading of Spark")) } - // february + // February val y1 = "2016-02-29" val y2 = "2017-02-29" val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") @@ -680,7 +680,7 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { 
checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - // february + // February val y1 = "2016-02-29" val y2 = "2017-02-29" val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 2eeb729ece3fb..ebfe8bdd7a749 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1316,7 +1316,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark ) } - test("oder by asc by default when not specify ascending and descending") { + test("order by asc by default when not specify ascending and descending") { checkAnswer( sql("SELECT a, b FROM testData2 ORDER BY a desc, b"), Seq(Row(3, 1), Row(3, 2), Row(2, 1), Row(2, 2), Row(1, 1), Row(1, 2)) @@ -2812,7 +2812,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark } } - test("SRARK-22266: the same aggregate function was calculated multiple times") { + test("SPARK-22266: the same aggregate function was calculated multiple times") { val query = "SELECT a, max(b+1), max(b+1) + 1 FROM testData2 GROUP BY a" val df = sql(query) val physical = df.queryExecution.sparkPlan @@ -3092,7 +3092,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark assert(scan.isInstanceOf[ParquetScan]) assert(scan.asInstanceOf[ParquetScan].pushedFilters === filters) case _ => - fail(s"unknow format $format") + fail(s"unknown format $format") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 36e55c0994f18..02c6fba9725d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -278,18 +278,18 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper val allCode = importedCode ++ code val tempQueries = if (allCode.exists(_.trim.startsWith("--QUERY-DELIMITER"))) { // Although the loop is heavy, only used for bracketed comments test. 
- val querys = new ArrayBuffer[String] + val queries = new ArrayBuffer[String] val otherCodes = new ArrayBuffer[String] var tempStr = "" var start = false for (c <- allCode) { if (c.trim.startsWith("--QUERY-DELIMITER-START")) { start = true - querys ++= splitWithSemicolon(otherCodes.toSeq) + queries ++= splitWithSemicolon(otherCodes.toSeq) otherCodes.clear() } else if (c.trim.startsWith("--QUERY-DELIMITER-END")) { start = false - querys += s"\n${tempStr.stripSuffix(";")}" + queries += s"\n${tempStr.stripSuffix(";")}" tempStr = "" } else if (start) { tempStr += s"\n$c" @@ -298,9 +298,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper } } if (otherCodes.nonEmpty) { - querys ++= splitWithSemicolon(otherCodes.toSeq) + queries ++= splitWithSemicolon(otherCodes.toSeq) } - querys.toSeq + queries.toSeq } else { splitWithSemicolon(allCode).toSeq } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index 576ad26505d27..5e1c6ba92803d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -166,13 +166,13 @@ class SparkSessionExtensionSuite extends SparkFunSuite { // inject rule that will run during AQE query stage optimization and will verify that the // custom tags were written in the preparation phase extensions.injectColumnar(session => - MyColumarRule(MyNewQueryStageRule(), MyNewQueryStageRule())) + MyColumnarRule(MyNewQueryStageRule(), MyNewQueryStageRule())) } withSession(extensions) { session => session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, true) assert(session.sessionState.queryStagePrepRules.contains(MyQueryStagePrepRule())) assert(session.sessionState.columnarRules.contains( - MyColumarRule(MyNewQueryStageRule(), MyNewQueryStageRule()))) + MyColumnarRule(MyNewQueryStageRule(), MyNewQueryStageRule()))) import session.sqlContext.implicits._ val data = Seq((100L), (200L), (300L)).toDF("vals").repartition(1) val df = data.selectExpr("vals + 1") @@ -205,12 +205,12 @@ class SparkSessionExtensionSuite extends SparkFunSuite { val extensions = create { extensions => extensions.injectColumnar(session => - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) + MyColumnarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) } withSession(extensions) { session => session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, enableAQE) assert(session.sessionState.columnarRules.contains( - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) + MyColumnarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) import session.sqlContext.implicits._ // perform a join to inject a broadcast exchange val left = Seq((1, 50L), (2, 100L), (3, 150L)).toDF("l1", "l2") @@ -244,12 +244,12 @@ class SparkSessionExtensionSuite extends SparkFunSuite { .config(COLUMN_BATCH_SIZE.key, 2) .withExtensions { extensions => extensions.injectColumnar(session => - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) } + MyColumnarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) } .getOrCreate() try { assert(session.sessionState.columnarRules.contains( - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) + MyColumnarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) import session.sqlContext.implicits._ val input = Seq((100L), (200L), (300L)) @@ 
-277,7 +277,7 @@ class SparkSessionExtensionSuite extends SparkFunSuite { assert(session.sessionState.functionRegistry .lookupFunction(MyExtensions.myFunction._1).isDefined) assert(session.sessionState.columnarRules.contains( - MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) + MyColumnarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) } finally { stop(session) } @@ -824,7 +824,7 @@ case class MyPostRule() extends Rule[SparkPlan] { } } -case class MyColumarRule(pre: Rule[SparkPlan], post: Rule[SparkPlan]) extends ColumnarRule { +case class MyColumnarRule(pre: Rule[SparkPlan], post: Rule[SparkPlan]) extends ColumnarRule { override def preColumnarTransitions: Rule[SparkPlan] = pre override def postColumnarTransitions: Rule[SparkPlan] = post } @@ -838,7 +838,7 @@ class MyExtensions extends (SparkSessionExtensions => Unit) { e.injectOptimizerRule(MyRule) e.injectParser(MyParser) e.injectFunction(MyExtensions.myFunction) - e.injectColumnar(session => MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) + e.injectColumnar(session => MyColumnarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala index f39b4b8b56c2e..ee9cf7b67225f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSTableStats.scala @@ -376,7 +376,7 @@ object TPCDSTableStats { "s_closed_date_sk" -> CatalogColumnStat(Some(70L), Some("2450823"), Some("2451313"), Some(296), Some(4), Some(4), None, CatalogColumnStat.VERSION), "s_store_id" -> CatalogColumnStat(Some(210L), None, None, Some(0), Some(16), Some(16), None, CatalogColumnStat.VERSION), "s_geography_class" -> CatalogColumnStat(Some(1L), None, None, Some(3), Some(7), Some(7), None, CatalogColumnStat.VERSION), - "s_tax_precentage" -> CatalogColumnStat(Some(12L), Some("0.00"), Some("0.11"), Some(5), Some(8), Some(8), None, CatalogColumnStat.VERSION) + "s_tax_percentage" -> CatalogColumnStat(Some(12L), Some("0.00"), Some("0.11"), Some(5), Some(8), Some(8), None, CatalogColumnStat.VERSION) )), "store_returns" -> CatalogStatistics(4837573440L, Some(28795080L), Map( "sr_item_sk" -> CatalogColumnStat(Some(197284L), Some("1"), Some("204000"), Some(0), Some(8), Some(8), None, CatalogColumnStat.VERSION), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 9a8c3e3cf1a11..b1d61658b8a8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -285,7 +285,7 @@ class DataSourceV2SQLSuite } } - test("CreateTable/RepalceTable: invalid schema if has interval type") { + test("CreateTable/ReplaceTable: invalid schema if has interval type") { Seq("CREATE", "REPLACE").foreach { action => val e1 = intercept[AnalysisException]( sql(s"$action TABLE table_name (id int, value interval) USING $v2Format")) @@ -1360,9 +1360,9 @@ class DataSourceV2SQLSuite test("ShowNamespaces: default v2 catalog doesn't support namespace") { spark.conf.set( - "spark.sql.catalog.testcat_no_namspace", + "spark.sql.catalog.testcat_no_namespace", classOf[BasicInMemoryTableCatalog].getName) - spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat_no_namspace") + 
spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat_no_namespace") val exception = intercept[AnalysisException] { sql("SHOW NAMESPACES") @@ -1373,11 +1373,11 @@ class DataSourceV2SQLSuite test("ShowNamespaces: v2 catalog doesn't support namespace") { spark.conf.set( - "spark.sql.catalog.testcat_no_namspace", + "spark.sql.catalog.testcat_no_namespace", classOf[BasicInMemoryTableCatalog].getName) val exception = intercept[AnalysisException] { - sql("SHOW NAMESPACES in testcat_no_namspace") + sql("SHOW NAMESPACES in testcat_no_namespace") } assert(exception.getMessage.contains("does not support namespaces")) @@ -2268,7 +2268,7 @@ class DataSourceV2SQLSuite val e = intercept[AnalysisException] { // Since the following multi-part name starts with `globalTempDB`, it is resolved to - // the session catalog, not the `gloabl_temp` v2 catalog. + // the session catalog, not the `global_temp` v2 catalog. sql(s"CREATE TABLE $globalTempDB.ns1.ns2.tbl (id bigint, data string) USING json") } assert(e.message.contains( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala index 67ec1028f1998..eec396b2e3998 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLWindowFunctionSuite.scala @@ -372,7 +372,7 @@ class SQLWindowFunctionSuite extends QueryTest with SharedSparkSession { spark.catalog.dropTempView("nums") } - test("window function: mutiple window expressions specified by range in a single expression") { + test("window function: multiple window expressions specified by range in a single expression") { val nums = sparkContext.parallelize(1 to 10).map(x => (x, x % 2)).toDF("x", "y") nums.createOrReplaceTempView("nums") withTempView("nums") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 1a826c00c81f2..81ba09f206b92 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -184,7 +184,7 @@ class SparkSqlParserSuite extends AnalysisTest { intercept("REFRESH", "Resource paths cannot be empty in REFRESH statements") } - test("SPARK-33118 CREATE TMEPORARY TABLE with LOCATION") { + test("SPARK-33118 CREATE TEMPORARY TABLE with LOCATION") { assertEqual("CREATE TEMPORARY TABLE t USING parquet OPTIONS (path '/data/tmp/testspark1')", CreateTempViewUsing(TableIdentifier("t", None), None, false, false, "parquet", Map("path" -> "/data/tmp/testspark1"))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index fe40d7dce344d..eb5643df4c752 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -398,8 +398,8 @@ class WholeStageCodegenSuite extends QueryTest with SharedSparkSession // Case2: The parent of a LocalTableScanExec supports WholeStageCodegen. // In this case, the LocalTableScanExec should be within a WholeStageCodegen domain // and no more InputAdapter is inserted as the direct parent of the LocalTableScanExec. 
- val aggedDF = Seq(1, 2, 3).toDF.groupBy("value").sum() - val executedPlan = aggedDF.queryExecution.executedPlan + val aggregatedDF = Seq(1, 2, 3).toDF.groupBy("value").sum() + val executedPlan = aggregatedDF.queryExecution.executedPlan // HashAggregateExec supports WholeStageCodegen and it's the parent of // LocalTableScanExec so LocalTableScanExec should be within a WholeStageCodegen domain. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 45ba2202d83d3..69f1565c2f8de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -755,9 +755,9 @@ class AdaptiveQueryExecSuite Utils.deleteRecursively(tableDir) df1.write.parquet(tableDir.getAbsolutePath) - val agged = spark.table("bucketed_table").groupBy("i").count() + val aggregated = spark.table("bucketed_table").groupBy("i").count() val error = intercept[Exception] { - agged.count() + aggregated.count() } assert(error.getCause().toString contains "Invalid bucket file") assert(error.getSuppressed.size === 0) @@ -962,9 +962,9 @@ class AdaptiveQueryExecSuite withSQLConf(SQLConf.UI_EXPLAIN_MODE.key -> mode, SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") { - val dfApdaptive = sql("SELECT * FROM testData JOIN testData2 ON key = a WHERE value = '1'") + val dfAdaptive = sql("SELECT * FROM testData JOIN testData2 ON key = a WHERE value = '1'") try { - checkAnswer(dfApdaptive, Row(1, "1", 1, 1) :: Row(1, "1", 1, 2) :: Nil) + checkAnswer(dfAdaptive, Row(1, "1", 1, 1) :: Row(1, "1", 1, 2) :: Nil) spark.sparkContext.listenerBus.waitUntilEmpty() assert(checkDone) } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala index 1e6e59456c887..d861bbbf67b1c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -1210,7 +1210,7 @@ class ArrowConvertersSuite extends SharedSparkSession { testQuietly("interval is unsupported for arrow") { val e = intercept[SparkException] { - calenderIntervalData.toDF().toArrowBatchRdd.collect() + calendarIntervalData.toDF().toArrowBatchRdd.collect() } assert(e.getCause.isInstanceOf[UnsupportedOperationException]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index b3cd9f1057a70..82d3e2dfe2212 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -549,9 +549,9 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { import testImplicits._ val df = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") - // Case 1: with partitioning columns but no schema: Option("inexistentColumns") + // Case 1: with partitioning columns but no schema: Option("nonexistentColumns") // Case 2: without schema and partitioning columns: None - Seq(Option("inexistentColumns"), None).foreach { partitionCols => + 
Seq(Option("nonexistentColumns"), None).foreach { partitionCols => withTempPath { pathToPartitionedTable => df.write.format("parquet").partitionBy("num") .save(pathToPartitionedTable.getCanonicalPath) @@ -589,9 +589,9 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { import testImplicits._ val df = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") - // Case 1: with partitioning columns but no schema: Option("inexistentColumns") + // Case 1: with partitioning columns but no schema: Option("nonexistentColumns") // Case 2: without schema and partitioning columns: None - Seq(Option("inexistentColumns"), None).foreach { partitionCols => + Seq(Option("nonexistentColumns"), None).foreach { partitionCols => withTempPath { pathToNonPartitionedTable => df.write.format("parquet").save(pathToNonPartitionedTable.getCanonicalPath) checkSchemaInCreatedDataSourceTable( @@ -608,7 +608,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { import testImplicits._ val df = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("num", "str") - // Case 1: with partitioning columns but no schema: Option("inexistentColumns") + // Case 1: with partitioning columns but no schema: Option("nonexistentColumns") // Case 2: without schema and partitioning columns: None Seq(Option("num"), None).foreach { partitionCols => withTempPath { pathToNonPartitionedTable => @@ -1910,7 +1910,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { |OPTIONS ( | path '${tempDir.getCanonicalPath}' |) - |CLUSTERED BY (inexistentColumnA) SORTED BY (inexistentColumnB) INTO 2 BUCKETS + |CLUSTERED BY (nonexistentColumnA) SORTED BY (nonexistentColumnB) INTO 2 BUCKETS """.stripMargin) } assert(e.message == "Cannot specify bucketing information if the table schema is not " + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index 5147a8485ea25..758540f1a42f5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -1187,26 +1187,26 @@ class PlanResolutionSuite extends AnalysisTest { ) } - DSV2ResolutionTests.foreach { case (sql, isSessionCatlog) => + DSV2ResolutionTests.foreach { case (sql, isSessionCatalog) => test(s"Data source V2 relation resolution '$sql'") { val parsed = parseAndResolve(sql, withDefault = true) - val catlogIdent = if (isSessionCatlog) v2SessionCatalog else testCat - val tableIdent = if (isSessionCatlog) "v2Table" else "tab" + val catalogIdent = if (isSessionCatalog) v2SessionCatalog else testCat + val tableIdent = if (isSessionCatalog) "v2Table" else "tab" parsed match { case AlterTable(_, _, r: DataSourceV2Relation, _) => - assert(r.catalog.exists(_ == catlogIdent)) + assert(r.catalog.exists(_ == catalogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) case Project(_, AsDataSourceV2Relation(r)) => - assert(r.catalog.exists(_ == catlogIdent)) + assert(r.catalog.exists(_ == catalogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) case AppendData(r: DataSourceV2Relation, _, _, _) => - assert(r.catalog.exists(_ == catlogIdent)) + assert(r.catalog.exists(_ == catalogIdent)) assert(r.identifier.exists(_.name() == tableIdent)) case DescribeRelation(r: ResolvedTable, _, _) => - assert(r.catalog == catlogIdent) + assert(r.catalog == catalogIdent) 
assert(r.identifier.name() == tableIdent) case ShowTableProperties(r: ResolvedTable, _) => - assert(r.catalog == catlogIdent) + assert(r.catalog == catalogIdent) assert(r.identifier.name() == tableIdent) case ShowTablePropertiesCommand(t: TableIdentifier, _) => assert(t.identifier == tableIdent) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala index dc97b7a55ee9a..6ba3d2723412b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala @@ -141,14 +141,14 @@ class DataSourceSuite extends SharedSparkSession with PrivateMethodTester { } test("Data source options should be propagated in method checkAndGlobPathIfNecessary") { - val dataSourceOptions = Map("fs.defaultFS" -> "nonexistsFs://nonexistsFs") + val dataSourceOptions = Map("fs.defaultFS" -> "nonexistentFs://nonexistentFs") val dataSource = DataSource(spark, "parquet", Seq("/path3"), options = dataSourceOptions) val checkAndGlobPathIfNecessary = PrivateMethod[Seq[Path]]('checkAndGlobPathIfNecessary) val message = intercept[java.io.IOException] { dataSource invokePrivate checkAndGlobPathIfNecessary(false, false) }.getMessage - val expectMessage = "No FileSystem for scheme nonexistsFs" + val expectMessage = "No FileSystem for scheme nonexistentFs" assert(message.filterNot(Set(':', '"').contains) == expectMessage) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala index 2b5cb27d59ad9..c90732183cb7a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala @@ -623,9 +623,9 @@ abstract class SchemaPruningSuite spark.read.format(dataSourceName).schema(schema).load(path + "/contacts") .createOrReplaceTempView("contacts") - val departmentScahem = "`depId` INT,`depName` STRING,`contactId` INT, " + + val departmentSchema = "`depId` INT,`depName` STRING,`contactId` INT, " + "`employer` STRUCT<`id`: INT, `company`: STRUCT<`name`: STRING, `address`: STRING>>" - spark.read.format(dataSourceName).schema(departmentScahem).load(path + "/departments") + spark.read.format(dataSourceName).schema(departmentSchema).load(path + "/departments") .createOrReplaceTempView("departments") testThunk @@ -651,9 +651,9 @@ abstract class SchemaPruningSuite spark.read.format(dataSourceName).schema(schema).load(path + "/contacts") .createOrReplaceTempView("contacts") - val departmentScahem = "`depId` INT,`depName` STRING,`contactId` INT, " + + val departmentSchema = "`depId` INT,`depName` STRING,`contactId` INT, " + "`employer` STRUCT<`id`: INT, `company`: STRUCT<`name`: STRING, `address`: STRING>>" - spark.read.format(dataSourceName).schema(departmentScahem).load(path + "/departments") + spark.read.format(dataSourceName).schema(departmentSchema).load(path + "/departments") .createOrReplaceTempView("departments") testThunk diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 8c5f7bed7c50d..2fe5953cbe12e 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -183,7 +183,7 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS val oneBlockColumnMeta = oneBlockMeta.getColumns().get(0) // This is the important assert. Column stats are written, but they are ignored // when the data is read back as mentioned above, b/c int96 is unsigned. This - // assert makes sure this holds even if we change parquet versions (if eg. there + // assert makes sure this holds even if we change parquet versions (if e.g. there // were ever statistics even on unsigned columns). assert(!oneBlockColumnMeta.getStatistics.hasNonNullValue) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 5c41614c45b6f..400f4d8e1b156 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -1157,7 +1157,7 @@ class ParquetV1PartitionDiscoverySuite extends ParquetPartitionDiscoverySuite { test("SPARK-21463: MetadataLogFileIndex should respect userSpecifiedSchema for partition cols") { withTempDir { tempDir => val output = new File(tempDir, "output").toString - val checkpoint = new File(tempDir, "chkpoint").toString + val checkpoint = new File(tempDir, "checkpoint").toString try { val stream = MemoryStream[(String, Int)] val df = stream.toDS().toDF("time", "value") @@ -1303,7 +1303,7 @@ class ParquetV2PartitionDiscoverySuite extends ParquetPartitionDiscoverySuite { test("SPARK-21463: MetadataLogFileIndex should respect userSpecifiedSchema for partition cols") { withTempDir { tempDir => val output = new File(tempDir, "output").toString - val checkpoint = new File(tempDir, "chkpoint").toString + val checkpoint = new File(tempDir, "checkpoint").toString try { val stream = MemoryStream[(String, Int)] val df = stream.toDS().toDF("time", "value") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 05d305a9b52ba..8f85fe3c52583 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -857,7 +857,7 @@ class ParquetV1QuerySuite extends ParquetQuerySuite { val df = spark.range(10).select(Seq.tabulate(11) {i => ('id + i).as(s"c$i")} : _*) df.write.mode(SaveMode.Overwrite).parquet(path) - // donot return batch, because whole stage codegen is disabled for wide table (>200 columns) + // do not return batch - whole stage codegen is disabled for wide table (>200 columns) val df2 = spark.read.parquet(path) val fileScan2 = df2.queryExecution.sparkPlan.find(_.isInstanceOf[FileSourceScanExec]).get assert(!fileScan2.asInstanceOf[FileSourceScanExec].supportsColumnar) @@ -890,7 +890,7 @@ class ParquetV2QuerySuite extends ParquetQuerySuite { val df = spark.range(10).select(Seq.tabulate(11) {i => ('id + i).as(s"c$i")} : _*) 
df.write.mode(SaveMode.Overwrite).parquet(path) - // donot return batch, because whole stage codegen is disabled for wide table (>200 columns) + // do not return batch - whole stage codegen is disabled for wide table (>200 columns) val df2 = spark.read.parquet(path) val fileScan2 = df2.queryExecution.sparkPlan.find(_.isInstanceOf[BatchScanExec]).get val parquetScan2 = fileScan2.asInstanceOf[BatchScanExec].scan.asInstanceOf[ParquetScan] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/exchange/EnsureRequirementsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/exchange/EnsureRequirementsSuite.scala index 296cbc3f3ad52..061799f439e5b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/exchange/EnsureRequirementsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/exchange/EnsureRequirementsSuite.scala @@ -60,7 +60,7 @@ class EnsureRequirementsSuite extends SharedSparkSession { case other => fail(other.toString) } - // Both sides are PartitioningCollection, but left side cannot be reorderd to match + // Both sides are PartitioningCollection, but left side cannot be reordered to match // and it should fall back to the right side. val smjExec3 = SortMergeJoinExec( exprA :: exprC :: Nil, exprB :: exprA :: Nil, Inner, None, plan1, plan1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index b4f921efcac81..21d17f40abb34 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -181,7 +181,7 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils assert(probes.toDouble > 1.0) } else { val mainValue = probes.split("\n").apply(1).stripPrefix("(").stripSuffix(")") - // Extract min, med, max from the string and strip off everthing else. + // Extract min, med, max from the string and strip off everything else. 
val index = mainValue.indexOf(" (", 0) mainValue.slice(0, index).split(", ").foreach { probe => assert(probe.toDouble > 1.0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala index 67dd88cbab63b..980d532dd4779 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala @@ -199,7 +199,7 @@ class HDFSMetadataLogSuite extends SharedSparkSession { intercept[IllegalStateException](verifyBatchIds(Seq(2, 3, 4), Some(1L), Some(5L))) intercept[IllegalStateException](verifyBatchIds(Seq(1, 2, 4, 5), Some(1L), Some(5L))) - // Related to SPARK-26629, this capatures the behavior for verifyBatchIds when startId > endId + // Related to SPARK-26629, this captures the behavior for verifyBatchIds when startId > endId intercept[IllegalStateException](verifyBatchIds(Seq(), Some(2L), Some(1L))) intercept[AssertionError](verifyBatchIds(Seq(2), Some(2L), Some(1L))) intercept[AssertionError](verifyBatchIds(Seq(1), Some(2L), Some(1L))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala index a702e00ff9f92..dfc64a41d9f86 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SparkPlanInfoSuite.scala @@ -24,10 +24,10 @@ class SparkPlanInfoSuite extends SharedSparkSession{ import testImplicits._ - def vaidateSparkPlanInfo(sparkPlanInfo: SparkPlanInfo): Unit = { + def validateSparkPlanInfo(sparkPlanInfo: SparkPlanInfo): Unit = { sparkPlanInfo.nodeName match { case "InMemoryTableScan" => assert(sparkPlanInfo.children.length == 1) - case _ => sparkPlanInfo.children.foreach(vaidateSparkPlanInfo) + case _ => sparkPlanInfo.children.foreach(validateSparkPlanInfo) } } @@ -39,6 +39,6 @@ class SparkPlanInfoSuite extends SharedSparkSession{ val planInfoResult = SparkPlanInfo.fromSparkPlan(dfWithCache.queryExecution.executedPlan) - vaidateSparkPlanInfo(planInfoResult) + validateSparkPlanInfo(planInfoResult) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index 567524ac75c2e..13b22dba1168b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -108,7 +108,7 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { .queryExecution.executedPlan) assert(res.length == 2) assert(res.forall { case (_, code, _) => - (code.contains("* Codegend pipeline") == flag) && + (code.contains("* Codegened pipeline") == flag) && (code.contains("// input[") == flag) }) } @@ -175,7 +175,7 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { df.hint("broadcast") } - // set local propert and assert + // set local property and assert val df2 = generateBroadcastDataFrame(confKey, confValue1) spark.sparkContext.setLocalProperty(confKey, confValue1) val checks = df1.join(df2).collect() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index f0b19071a969b..ede5fe538a028 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -1418,7 +1418,7 @@ class JDBCSuite extends QueryTest } test("SPARK-24327 verify and normalize a partition column based on a JDBC resolved schema") { - def testJdbcParitionColumn(partColName: String, expectedColumnName: String): Unit = { + def testJdbcPartitionColumn(partColName: String, expectedColumnName: String): Unit = { val df = spark.read.format("jdbc") .option("url", urlWithUserAndPass) .option("dbtable", "TEST.PARTITION") @@ -1439,16 +1439,16 @@ class JDBCSuite extends QueryTest } } - testJdbcParitionColumn("THEID", "THEID") - testJdbcParitionColumn("\"THEID\"", "THEID") + testJdbcPartitionColumn("THEID", "THEID") + testJdbcPartitionColumn("\"THEID\"", "THEID") withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { - testJdbcParitionColumn("ThEiD", "THEID") + testJdbcPartitionColumn("ThEiD", "THEID") } - testJdbcParitionColumn("THE ID", "THE ID") + testJdbcPartitionColumn("THE ID", "THE ID") def testIncorrectJdbcPartitionColumn(partColName: String): Unit = { val errMsg = intercept[AnalysisException] { - testJdbcParitionColumn(partColName, "THEID") + testJdbcPartitionColumn(partColName, "THEID") }.getMessage assert(errMsg.contains(s"User-defined partition column $partColName not found " + "in the JDBC relation:")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 0ff9303421ade..4ae8cdbeb4f1e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -639,13 +639,14 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i", "j").saveAsTable("bucketed_table") val tbl = spark.table("bucketed_table") - val agged = tbl.groupBy("i", "j").agg(max("k")) + val aggregated = tbl.groupBy("i", "j").agg(max("k")) checkAnswer( - agged.sort("i", "j"), + aggregated.sort("i", "j"), df1.groupBy("i", "j").agg(max("k")).sort("i", "j")) - assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchangeExec]).isEmpty) + assert( + aggregated.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchangeExec]).isEmpty) } } @@ -679,13 +680,14 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") val tbl = spark.table("bucketed_table") - val agged = tbl.groupBy("i", "j").agg(max("k")) + val aggregated = tbl.groupBy("i", "j").agg(max("k")) checkAnswer( - agged.sort("i", "j"), + aggregated.sort("i", "j"), df1.groupBy("i", "j").agg(max("k")).sort("i", "j")) - assert(agged.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchangeExec]).isEmpty) + assert( + aggregated.queryExecution.executedPlan.find(_.isInstanceOf[ShuffleExchangeExec]).isEmpty) } } @@ -806,9 +808,9 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { Utils.deleteRecursively(tableDir) df1.write.parquet(tableDir.getAbsolutePath) - val agged = spark.table("bucketed_table").groupBy("i").count() + val aggregated = spark.table("bucketed_table").groupBy("i").count() val error = intercept[Exception] { - agged.count() 
+ aggregated.count() } assert(error.getCause().toString contains "Invalid bucket file") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 9464f7e4c1241..9a7c7e0edc409 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -234,7 +234,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSparkSession { } } - test("create table using as select - with overriden max number of buckets") { + test("create table using as select - with overridden max number of buckets") { def createTableSql(numBuckets: Int): String = s""" |CREATE TABLE t USING PARQUET diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index ca3e714665818..0da6b487e31ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -359,7 +359,7 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { val schemaNotMatch = intercept[Exception] { sql( s""" - |CREATE $tableType relationProvierWithSchema (i int) + |CREATE $tableType relationProviderWithSchema (i int) |USING org.apache.spark.sql.sources.SimpleScanSource |OPTIONS ( | From '1', @@ -373,7 +373,7 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { val schemaNeeded = intercept[Exception] { sql( s""" - |CREATE $tableType schemaRelationProvierWithoutSchema + |CREATE $tableType schemaRelationProviderWithoutSchema |USING org.apache.spark.sql.sources.AllDataTypesScanSource |OPTIONS ( | From '1', @@ -387,7 +387,7 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { test("read the data source tables that do not extend SchemaRelationProvider") { Seq("TEMPORARY VIEW", "TABLE").foreach { tableType => - val tableName = "relationProvierWithSchema" + val tableName = "relationProviderWithSchema" withTable (tableName) { sql( s""" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 3c74e316f260e..b240d2058a018 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -1946,9 +1946,9 @@ class FileStreamSourceSuite extends FileStreamSourceTest { test("SourceFileArchiver - fail when base archive path matches source pattern") { val fakeFileSystem = new FakeFileSystem("fake") - def assertThrowIllegalArgumentException(sourcePatttern: Path, baseArchivePath: Path): Unit = { + def assertThrowIllegalArgumentException(sourcePattern: Path, baseArchivePath: Path): Unit = { intercept[IllegalArgumentException] { - new SourceFileArchiver(fakeFileSystem, sourcePatttern, fakeFileSystem, baseArchivePath) + new SourceFileArchiver(fakeFileSystem, sourcePattern, fakeFileSystem, baseArchivePath) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index e64d5f6f3587e..ed284df10aced 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -1064,13 +1064,13 @@ class StreamSuite extends StreamTest { } test("SPARK-30657: streaming limit should not apply on limits on state subplans") { - val streanData = MemoryStream[Int] - val streamingDF = streanData.toDF().toDF("value") + val streamData = MemoryStream[Int] + val streamingDF = streamData.toDF().toDF("value") val staticDF = spark.createDataset(Seq(1)).toDF("value").orderBy("value") testStream(streamingDF.join(staticDF.limit(1), "value"))( - AddData(streanData, 1, 2, 3), + AddData(streamData, 1, 2, 3), CheckAnswer(Row(1)), - AddData(streanData, 1, 3, 5), + AddData(streamData, 1, 3, 5), CheckAnswer(Row(1), Row(1))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala index 0296366f3578b..9cf649605ed1c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamTableAPISuite.scala @@ -107,12 +107,12 @@ class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { } test("read: read table without streaming capability support") { - val tableIdentifer = "testcat.table_name" + val tableIdentifier = "testcat.table_name" - spark.sql(s"CREATE TABLE $tableIdentifer (id bigint, data string) USING foo") + spark.sql(s"CREATE TABLE $tableIdentifier (id bigint, data string) USING foo") intercept[AnalysisException] { - spark.readStream.table(tableIdentifer) + spark.readStream.table(tableIdentifier) }.message.contains("does not support either micro-batch or continuous scan") } @@ -213,7 +213,7 @@ class DataStreamTableAPISuite extends StreamTest with BeforeAndAfter { } test("write: write to non-exist table with custom catalog") { - val tableIdentifier = "testcat.nonexisttable" + val tableIdentifier = "testcat.nonexistenttable" withTable(tableIdentifier) { runTestWithStreamAppend(tableIdentifier) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala index c51faaf10f5dd..a1fd4a0215b1f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala @@ -169,10 +169,10 @@ private[sql] trait SQLTestData { self => rdd } - protected lazy val calenderIntervalData: RDD[IntervalData] = { + protected lazy val calendarIntervalData: RDD[IntervalData] = { val rdd = spark.sparkContext.parallelize( IntervalData(new CalendarInterval(1, 1, 1)) :: Nil) - rdd.toDF().createOrReplaceTempView("calenderIntervalData") + rdd.toDF().createOrReplaceTempView("calendarIntervalData") rdd } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala index cfc92a780308d..ed2e309fa075a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SharedSparkSession.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} trait SharedSparkSession extends SQLTestUtils with SharedSparkSessionBase { /** - * Suites extending [[SharedSparkSession]] are sharing resources (eg. SparkSession) in their + * Suites extending [[SharedSparkSession]] are sharing resources (e.g. 
SparkSession) in their * tests. That trait initializes the spark session in its [[beforeAll()]] implementation before * the automatic thread snapshot is performed, so the audit code could fail to report threads * leaked by that shared session. From c001dd49e4e9bb42f18618afe710e401b2df3afb Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 8 Dec 2020 10:43:41 -0800 Subject: [PATCH 146/150] [SPARK-33675][INFRA][FOLLOWUP] Schedule branch-3.1 snapshot at master branch ### What changes were proposed in this pull request? Currently, `master`/`branch-3.0`/`branch-2.4` snapshot publishing has been successfully migrated from Jenkins to `GitHub Action`. - https://github.com/apache/spark/actions?query=workflow%3A%22Publish+Snapshot%22 This PR aims to schedule the `branch-3.1` snapshot at the `master` branch. ### Why are the changes needed? This is because it turns out that `GitHub Action Schedule` works only on the `master` branch (the default branch). - https://docs.github.com/en/free-pro-team@latest/actions/reference/events-that-trigger-workflows#scheduled-events ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? The matrix triggering is tested on the forked branch. - https://github.com/dongjoon-hyun/spark/runs/1519015974 Closes #30674 from dongjoon-hyun/SPARK-SCHEDULE-3.1. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .github/workflows/publish_snapshot.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml index 9871680f73891..504d702fd1f22 100644 --- a/.github/workflows/publish_snapshot.yml +++ b/.github/workflows/publish_snapshot.yml @@ -7,9 +7,17 @@ on: jobs: publish-snapshot: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + branch: + - master + - branch-3.1 steps: - name: Checkout Spark repository uses: actions/checkout@master + with: + ref: ${{ matrix.branch }} - name: Cache Maven local repository uses: actions/cache@v2 with: @@ -27,4 +35,5 @@ jobs: ASF_PASSWORD: ${{ secrets.NEXUS_PW }} GPG_KEY: "not_used" GPG_PASSPHRASE: "not_used" + GIT_REF: ${{ matrix.branch }} run: ./dev/create-release/release-build.sh publish-snapshot From 6fd234503cf1e85715ccd3bda42f29dae1daa71b Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 8 Dec 2020 11:41:35 -0800 Subject: [PATCH 147/150] [SPARK-32110][SQL] normalize special floating numbers in HyperLogLog++ ### What changes were proposed in this pull request? Currently, Spark treats 0.0 and -0.0 as semantically equal, while it still retains the difference between them so that users can see -0.0 when displaying the data set. The comparison expressions in Spark take care of the special floating numbers and implement the correct semantics. However, Spark doesn't always use these comparison expressions to compare values, and we need to normalize the special floating numbers before comparing them in these places: 1. GROUP BY 2. join keys 3. window partition keys This PR fixes one more place that compares values without using comparison expressions: HyperLogLog++ ### Why are the changes needed? Fix the query result. ### Does this PR introduce _any_ user-facing change? Yes, the result of HyperLogLog++ becomes correct now. ### How was this patch tested? A new test case, plus a few more test cases that already pass before this PR, to improve test coverage. Closes #30673 from cloud-fan/bug.
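For readers skimming the diff below, here is a minimal, self-contained Scala sketch of the normalization idea (editorial illustration only, not part of the patch; the object and method names are invented). It mirrors what the `FLOAT_NORMALIZER`/`DOUBLE_NORMALIZER` functions introduced by this commit do before a value is hashed in `HyperLogLogPlusPlusHelper.update`: collapse `-0.0` into `0.0` and every NaN bit pattern into the canonical NaN, so that values the comparison semantics treat as equal also produce the same hash.

```scala
import java.lang.{Double => JDouble}

// Editorial sketch; names are illustrative, not Spark APIs.
object NormalizationSketch {
  // Collapse -0.0 into 0.0 and every NaN bit pattern into the canonical NaN,
  // so that values which compare as equal also hash to the same bits.
  def normalizeDouble(d: Double): Double =
    if (d.isNaN) Double.NaN
    else if (d == -0.0d) 0.0d
    else d

  def main(args: Array[String]): Unit = {
    // A non-canonical NaN encoding, like the one used in the new HyperLogLogPlusPlusSuite test.
    val specialNaN = JDouble.longBitsToDouble(0x7ff1234512345678L)

    // 0.0 == -0.0 is true, yet their raw bits (and hence their hashes) differ.
    println(JDouble.doubleToRawLongBits(0.0d) == JDouble.doubleToRawLongBits(-0.0d))  // false

    // After normalization both zeros map to identical bits ...
    println(JDouble.doubleToRawLongBits(normalizeDouble(0.0d)) ==
      JDouble.doubleToRawLongBits(normalizeDouble(-0.0d)))                            // true

    // ... and all NaN encodings collapse to the canonical NaN.
    println(JDouble.doubleToRawLongBits(normalizeDouble(Double.NaN)) ==
      JDouble.doubleToRawLongBits(normalizeDouble(specialNaN)))                       // true
  }
}
```

Without this normalization, HyperLogLog++ (which backs `approx_count_distinct`) would hash `0.0` and `-0.0`, or two different NaN encodings, to different values and over-count distinct items; the new `HyperLogLogPlusPlusSuite` tests below assert an estimate of 1 in exactly these cases.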
Authored-by: Wenchen Fan Signed-off-by: Dongjoon Hyun --- .../optimizer/NormalizeFloatingNumbers.scala | 45 +++++----- .../util/HyperLogLogPlusPlusHelper.scala | 8 +- .../catalyst/expressions/PredicateSuite.scala | 90 +++++++++++++++++++ .../aggregate/HyperLogLogPlusPlusSuite.scala | 24 ++++- 4 files changed, 144 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala index 4434c29cbb3c4..ac8766cd74367 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala @@ -143,6 +143,28 @@ object NormalizeFloatingNumbers extends Rule[LogicalPlan] { case _ => throw new IllegalStateException(s"fail to normalize $expr") } + + val FLOAT_NORMALIZER: Any => Any = (input: Any) => { + val f = input.asInstanceOf[Float] + if (f.isNaN) { + Float.NaN + } else if (f == -0.0f) { + 0.0f + } else { + f + } + } + + val DOUBLE_NORMALIZER: Any => Any = (input: Any) => { + val d = input.asInstanceOf[Double] + if (d.isNaN) { + Double.NaN + } else if (d == -0.0d) { + 0.0d + } else { + d + } + } } case class NormalizeNaNAndZero(child: Expression) extends UnaryExpression with ExpectsInputTypes { @@ -152,27 +174,8 @@ case class NormalizeNaNAndZero(child: Expression) extends UnaryExpression with E override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(FloatType, DoubleType)) private lazy val normalizer: Any => Any = child.dataType match { - case FloatType => (input: Any) => { - val f = input.asInstanceOf[Float] - if (f.isNaN) { - Float.NaN - } else if (f == -0.0f) { - 0.0f - } else { - f - } - } - - case DoubleType => (input: Any) => { - val d = input.asInstanceOf[Double] - if (d.isNaN) { - Double.NaN - } else if (d == -0.0d) { - 0.0d - } else { - d - } - } + case FloatType => NormalizeFloatingNumbers.FLOAT_NORMALIZER + case DoubleType => NormalizeFloatingNumbers.DOUBLE_NORMALIZER } override def nullSafeEval(input: Any): Any = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala index ea619c6a7666c..6471a746f2edf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala @@ -22,6 +22,7 @@ import java.util import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.XxHash64Function +import org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers.{DOUBLE_NORMALIZER, FLOAT_NORMALIZER} import org.apache.spark.sql.types._ // A helper class for HyperLogLogPlusPlus. @@ -88,7 +89,12 @@ class HyperLogLogPlusPlusHelper(relativeSD: Double) extends Serializable { * * Variable names in the HLL++ paper match variable names in the code. */ - def update(buffer: InternalRow, bufferOffset: Int, value: Any, dataType: DataType): Unit = { + def update(buffer: InternalRow, bufferOffset: Int, _value: Any, dataType: DataType): Unit = { + val value = dataType match { + case FloatType => FLOAT_NORMALIZER.apply(_value) + case DoubleType => DOUBLE_NORMALIZER.apply(_value) + case _ => _value + } // Create the hashed value 'x'. 
val x = XxHash64Function.hash(value, dataType, 42L) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index a36baec1a0b99..6f75623dc59ae 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -554,4 +554,94 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(GreaterThan(Literal(Float.NaN), Literal(Float.NaN)), false) checkEvaluation(GreaterThan(Literal(0.0F), Literal(-0.0F)), false) } + + test("SPARK-32110: compare special double/float values in array") { + def createUnsafeDoubleArray(d: Double): Literal = { + Literal(UnsafeArrayData.fromPrimitiveArray(Array(d)), ArrayType(DoubleType)) + } + def createSafeDoubleArray(d: Double): Literal = { + Literal(new GenericArrayData(Array(d)), ArrayType(DoubleType)) + } + def createUnsafeFloatArray(d: Double): Literal = { + Literal(UnsafeArrayData.fromPrimitiveArray(Array(d.toFloat)), ArrayType(FloatType)) + } + def createSafeFloatArray(d: Double): Literal = { + Literal(new GenericArrayData(Array(d.toFloat)), ArrayType(FloatType)) + } + def checkExpr( + exprBuilder: (Expression, Expression) => Expression, + left: Double, + right: Double, + expected: Any): Unit = { + // test double + checkEvaluation( + exprBuilder(createUnsafeDoubleArray(left), createUnsafeDoubleArray(right)), expected) + checkEvaluation( + exprBuilder(createUnsafeDoubleArray(left), createSafeDoubleArray(right)), expected) + checkEvaluation( + exprBuilder(createSafeDoubleArray(left), createSafeDoubleArray(right)), expected) + // test float + checkEvaluation( + exprBuilder(createUnsafeFloatArray(left), createUnsafeFloatArray(right)), expected) + checkEvaluation( + exprBuilder(createUnsafeFloatArray(left), createSafeFloatArray(right)), expected) + checkEvaluation( + exprBuilder(createSafeFloatArray(left), createSafeFloatArray(right)), expected) + } + + checkExpr(EqualTo, Double.NaN, Double.NaN, true) + checkExpr(EqualTo, Double.NaN, Double.PositiveInfinity, false) + checkExpr(EqualTo, 0.0, -0.0, true) + checkExpr(GreaterThan, Double.NaN, Double.PositiveInfinity, true) + checkExpr(GreaterThan, Double.NaN, Double.NaN, false) + checkExpr(GreaterThan, 0.0, -0.0, false) + } + + test("SPARK-32110: compare special double/float values in struct") { + def createUnsafeDoubleRow(d: Double): Literal = { + val dt = new StructType().add("d", "double") + val converter = UnsafeProjection.create(dt) + val unsafeRow = converter.apply(InternalRow(d)) + Literal(unsafeRow, dt) + } + def createSafeDoubleRow(d: Double): Literal = { + Literal(InternalRow(d), new StructType().add("d", "double")) + } + def createUnsafeFloatRow(d: Double): Literal = { + val dt = new StructType().add("f", "float") + val converter = UnsafeProjection.create(dt) + val unsafeRow = converter.apply(InternalRow(d.toFloat)) + Literal(unsafeRow, dt) + } + def createSafeFloatRow(d: Double): Literal = { + Literal(InternalRow(d.toFloat), new StructType().add("f", "float")) + } + def checkExpr( + exprBuilder: (Expression, Expression) => Expression, + left: Double, + right: Double, + expected: Any): Unit = { + // test double + checkEvaluation( + exprBuilder(createUnsafeDoubleRow(left), createUnsafeDoubleRow(right)), expected) + checkEvaluation( + exprBuilder(createUnsafeDoubleRow(left), createSafeDoubleRow(right)), 
expected) + checkEvaluation( + exprBuilder(createSafeDoubleRow(left), createSafeDoubleRow(right)), expected) + // test float + checkEvaluation( + exprBuilder(createUnsafeFloatRow(left), createUnsafeFloatRow(right)), expected) + checkEvaluation( + exprBuilder(createUnsafeFloatRow(left), createSafeFloatRow(right)), expected) + checkEvaluation( + exprBuilder(createSafeFloatRow(left), createSafeFloatRow(right)), expected) + } + + checkExpr(EqualTo, Double.NaN, Double.NaN, true) + checkExpr(EqualTo, Double.NaN, Double.PositiveInfinity, false) + checkExpr(EqualTo, 0.0, -0.0, true) + checkExpr(GreaterThan, Double.NaN, Double.PositiveInfinity, true) + checkExpr(GreaterThan, Double.NaN, Double.NaN, false) + checkExpr(GreaterThan, 0.0, -0.0, false) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala index 98fd04c9cca91..1afccea5aef15 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/HyperLogLogPlusPlusSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions.aggregate +import java.lang.{Double => JDouble} import java.util.Random import scala.collection.mutable @@ -24,7 +25,7 @@ import scala.collection.mutable import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, SpecificInternalRow} -import org.apache.spark.sql.types.{DataType, IntegerType} +import org.apache.spark.sql.types.{DataType, DoubleType, IntegerType} class HyperLogLogPlusPlusSuite extends SparkFunSuite { @@ -153,4 +154,25 @@ class HyperLogLogPlusPlusSuite extends SparkFunSuite { // Check if the buffers are equal. assert(buffer2 == buffer1a, "Buffers should be equal") } + + test("SPARK-32110: add 0.0 and -0.0") { + val (hll, input, buffer) = createEstimator(0.05, DoubleType) + input.setDouble(0, 0.0) + hll.update(buffer, input) + input.setDouble(0, -0.0) + hll.update(buffer, input) + evaluateEstimate(hll, buffer, 1); + } + + test("SPARK-32110: add NaN") { + val (hll, input, buffer) = createEstimator(0.05, DoubleType) + input.setDouble(0, Double.NaN) + hll.update(buffer, input) + val specialNaN = JDouble.longBitsToDouble(0x7ff1234512345678L) + assert(JDouble.isNaN(specialNaN)) + assert(JDouble.doubleToRawLongBits(Double.NaN) != JDouble.doubleToRawLongBits(specialNaN)) + input.setDouble(0, specialNaN) + hll.update(buffer, input) + evaluateEstimate(hll, buffer, 1); + } } From 3ac70f169d653f22bd04ec7bb6ebb49696807bb2 Mon Sep 17 00:00:00 2001 From: Nicholas Marion Date: Tue, 8 Dec 2020 12:11:06 -0800 Subject: [PATCH 148/150] [SPARK-33695][BUILD] Upgrade to jackson to 2.10.5 and jackson-databind to 2.10.5.1 ### What changes were proposed in this pull request? Upgrade the jackson dependencies to 2.10.5 and jackson-databind to 2.10.5.1 ### Why are the changes needed? Jackson dependency has vulnerability CVE-2020-25649. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing unit tests. Closes #30656 from n-marion/SPARK-33695_upgrade-jackson. 
Authored-by: Nicholas Marion Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 16 ++++++++-------- dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 16 ++++++++-------- pom.xml | 5 +++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index 401050a60e493..3a54dbd6232e3 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -103,17 +103,17 @@ httpclient/4.5.13//httpclient-4.5.13.jar httpcore/4.4.12//httpcore-4.4.12.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.4.0//ivy-2.4.0.jar -jackson-annotations/2.10.0//jackson-annotations-2.10.0.jar +jackson-annotations/2.10.5//jackson-annotations-2.10.5.jar jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar -jackson-core/2.10.0//jackson-core-2.10.0.jar -jackson-databind/2.10.0//jackson-databind-2.10.0.jar -jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-core/2.10.5//jackson-core-2.10.5.jar +jackson-databind/2.10.5.1//jackson-databind-2.10.5.1.jar +jackson-dataformat-yaml/2.10.5//jackson-dataformat-yaml-2.10.5.jar jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar -jackson-module-paranamer/2.10.0//jackson-module-paranamer-2.10.0.jar -jackson-module-scala_2.12/2.10.0//jackson-module-scala_2.12-2.10.0.jar +jackson-module-jaxb-annotations/2.10.5//jackson-module-jaxb-annotations-2.10.5.jar +jackson-module-paranamer/2.10.5//jackson-module-paranamer-2.10.5.jar +jackson-module-scala_2.12/2.10.5//jackson-module-scala_2.12-2.10.5.jar jackson-xc/1.9.13//jackson-xc-1.9.13.jar jakarta.activation-api/1.2.1//jakarta.activation-api-1.2.1.jar jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar @@ -220,7 +220,7 @@ shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.9.0//shims-0.9.0.jar slf4j-api/1.7.30//slf4j-api-1.7.30.jar slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar -snakeyaml/1.24//snakeyaml-1.24.jar +snakeyaml/1.26//snakeyaml-1.26.jar snappy-java/1.1.8//snappy-java-1.1.8.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index b0f8935843281..67bcc7a8ed902 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -102,18 +102,18 @@ httpclient/4.5.13//httpclient-4.5.13.jar httpcore/4.4.12//httpcore-4.4.12.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.4.0//ivy-2.4.0.jar -jackson-annotations/2.10.0//jackson-annotations-2.10.0.jar +jackson-annotations/2.10.5//jackson-annotations-2.10.5.jar jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar -jackson-core/2.10.0//jackson-core-2.10.0.jar -jackson-databind/2.10.0//jackson-databind-2.10.0.jar -jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-core/2.10.5//jackson-core-2.10.5.jar +jackson-databind/2.10.5.1//jackson-databind-2.10.5.1.jar +jackson-dataformat-yaml/2.10.5//jackson-dataformat-yaml-2.10.5.jar jackson-datatype-jsr310/2.11.2//jackson-datatype-jsr310-2.11.2.jar jackson-jaxrs-base/2.9.5//jackson-jaxrs-base-2.9.5.jar jackson-jaxrs-json-provider/2.9.5//jackson-jaxrs-json-provider-2.9.5.jar 
jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar -jackson-module-paranamer/2.10.0//jackson-module-paranamer-2.10.0.jar -jackson-module-scala_2.12/2.10.0//jackson-module-scala_2.12-2.10.0.jar +jackson-module-jaxb-annotations/2.10.5//jackson-module-jaxb-annotations-2.10.5.jar +jackson-module-paranamer/2.10.5//jackson-module-paranamer-2.10.5.jar +jackson-module-scala_2.12/2.10.5//jackson-module-scala_2.12-2.10.5.jar jakarta.activation-api/1.2.1//jakarta.activation-api-1.2.1.jar jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar @@ -235,7 +235,7 @@ shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.9.0//shims-0.9.0.jar slf4j-api/1.7.30//slf4j-api-1.7.30.jar slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar -snakeyaml/1.24//snakeyaml-1.24.jar +snakeyaml/1.26//snakeyaml-1.26.jar snappy-java/1.1.8//snappy-java-1.1.8.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar diff --git a/pom.xml b/pom.xml index 364dec688b38b..23eb16a7db472 100644 --- a/pom.xml +++ b/pom.xml @@ -169,7 +169,8 @@ true 1.9.13 - 2.10.0 + 2.10.5 + 2.10.5.1 1.1.8 1.1.2 1.10 @@ -773,7 +774,7 @@ com.fasterxml.jackson.core jackson-databind - ${fasterxml.jackson.version} + ${fasterxml.jackson-databind.version} com.fasterxml.jackson.core From f021f6d3c72e1c84637798b4ddcb7e208fdfbf46 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 9 Dec 2020 11:18:09 +0800 Subject: [PATCH 149/150] [MINOR][ML] Increase Bounded MLOR (without regularization) test error tolerance ### What changes were proposed in this pull request? Improve LogisticRegression test error tolerance ### Why are the changes needed? When we switch BLAS version, some of the tests will fail due to too strict error tolerance in test. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? N/A Closes #30587 from WeichenXu123/fix_lor_test. Authored-by: Weichen Xu Signed-off-by: Weichen Xu --- .../LogisticRegressionSuite.scala | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index d0b282db1ece8..d2814b420e017 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -1548,9 +1548,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val interceptsExpected1 = Vectors.dense( 1.0000152482448372, 3.591773288423673, 5.079685953744937) - checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected1) + checkBoundedMLORCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected1) assert(model1.interceptVector ~== interceptsExpected1 relTol 0.01) - checkCoefficientsEquivalent(model2.coefficientMatrix, coefficientsExpected1) + checkBoundedMLORCoefficientsEquivalent(model2.coefficientMatrix, coefficientsExpected1) assert(model2.interceptVector ~== interceptsExpected1 relTol 0.01) // Bound constrained optimization with bound on both side. 
@@ -1585,9 +1585,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { isTransposed = true) val interceptsExpected3 = Vectors.dense(1.0, 2.0, 2.0) - checkCoefficientsEquivalent(model3.coefficientMatrix, coefficientsExpected3) + checkBoundedMLORCoefficientsEquivalent(model3.coefficientMatrix, coefficientsExpected3) assert(model3.interceptVector ~== interceptsExpected3 relTol 0.01) - checkCoefficientsEquivalent(model4.coefficientMatrix, coefficientsExpected3) + checkBoundedMLORCoefficientsEquivalent(model4.coefficientMatrix, coefficientsExpected3) assert(model4.interceptVector ~== interceptsExpected3 relTol 0.01) // Bound constrained optimization with infinite bound on both side. @@ -1621,9 +1621,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val interceptsExpected5 = Vectors.dense( -2.2231282183460723, 0.3669496747012527, 1.856178543644802) - checkCoefficientsEquivalent(model5.coefficientMatrix, coefficientsExpected5) + checkBoundedMLORCoefficientsEquivalent(model5.coefficientMatrix, coefficientsExpected5) assert(model5.interceptVector ~== interceptsExpected5 relTol 0.01) - checkCoefficientsEquivalent(model6.coefficientMatrix, coefficientsExpected5) + checkBoundedMLORCoefficientsEquivalent(model6.coefficientMatrix, coefficientsExpected5) assert(model6.interceptVector ~== interceptsExpected5 relTol 0.01) } @@ -1719,9 +1719,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 1.7524631428961193, 1.2292565990448736, 1.3433784431904323, 1.5846063017678864), isTransposed = true) - checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected) + checkBoundedMLORCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected) assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) - checkCoefficientsEquivalent(model2.coefficientMatrix, coefficientsExpected) + checkBoundedMLORCoefficientsEquivalent(model2.coefficientMatrix, coefficientsExpected) assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) } @@ -2953,16 +2953,17 @@ object LogisticRegressionSuite { } /** + * Note: This method is only used in Bounded MLOR (without regularization) test * When no regularization is applied, the multinomial coefficients lack identifiability * because we do not use a pivot class. We can add any constant value to the coefficients * and get the same likelihood. If fitting under bound constrained optimization, we don't * choose the mean centered coefficients like what we do for unbound problems, since they * may out of the bounds. We use this function to check whether two coefficients are equivalent. */ - def checkCoefficientsEquivalent(coefficients1: Matrix, coefficients2: Matrix): Unit = { + def checkBoundedMLORCoefficientsEquivalent(coefficients1: Matrix, coefficients2: Matrix): Unit = { coefficients1.colIter.zip(coefficients2.colIter).foreach { case (col1: Vector, col2: Vector) => (col1.asBreeze - col2.asBreeze).toArray.toSeq.sliding(2).foreach { - case Seq(v1, v2) => assert(v1 ~= v2 absTol 1E-3) + case Seq(v1, v2) => assert(v1 ~= v2 absTol 1E-2) } } } From 29fed23ba16d580e6247b6e70e9c9eef0698aa95 Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Wed, 9 Dec 2020 05:06:37 +0000 Subject: [PATCH 150/150] [SPARK-33703][SQL] Migrate MSCK REPAIR TABLE to use UnresolvedTable to resolve the identifier ### What changes were proposed in this pull request? This PR proposes to migrate `MSCK REPAIR TABLE` to use `UnresolvedTable` to resolve the table identifier. 
This allows consistent resolution rules (temp view first, etc.) to be applied for both v1/v2 commands. More info about the consistent resolution rule proposal can be found in [JIRA](https://issues.apache.org/jira/browse/SPARK-29900) or [proposal doc](https://docs.google.com/document/d/1hvLjGA8y_W_hhilpngXVub1Ebv8RsMap986nENCFnrg/edit?usp=sharing). Note that `MSCK REPAIR TABLE` is not supported for v2 tables. ### Why are the changes needed? The PR makes the resolution behavior consistent. For example, ```scala sql("CREATE DATABASE test") sql("CREATE TABLE spark_catalog.test.t (id bigint, val string) USING csv PARTITIONED BY (id)") sql("CREATE TEMPORARY VIEW t AS SELECT 2") sql("USE spark_catalog.test") sql("MSCK REPAIR TABLE t") // works fine ``` , but after this PR: ``` sql("MSCK REPAIR TABLE t") org.apache.spark.sql.AnalysisException: t is a temp view. 'MSCK REPAIR TABLE' expects a table; line 1 pos 0 ``` , which is consistent with the behavior of other commands. ### Does this PR introduce _any_ user-facing change? After this PR, `MSCK REPAIR TABLE t` in the above example is resolved to the temp view `t` first, instead of `spark_catalog.test.t`. ### How was this patch tested? Updated existing tests. Closes #30664 from imback82/repair_table_V2. Authored-by: Terry Kim Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/parser/AstBuilder.scala | 5 +++-- .../spark/sql/catalyst/plans/logical/statements.scala | 5 ----- .../spark/sql/catalyst/plans/logical/v2Commands.scala | 7 +++++++ .../spark/sql/catalyst/parser/DDLParserSuite.scala | 2 +- .../sql/catalyst/analysis/ResolveSessionCatalog.scala | 7 ++----- .../execution/datasources/v2/DataSourceV2Strategy.scala | 3 +++ .../spark/sql/connector/DataSourceV2SQLSuite.scala | 9 +-------- 7 files changed, 17 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 89b81ec1d83aa..7787e199d3770 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -3547,7 +3547,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Create a [[RepairTableStatement]]. + * Create a [[RepairTable]].
* * For example: * {{{ @@ -3555,7 +3555,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg * }}} */ override def visitRepairTable(ctx: RepairTableContext): LogicalPlan = withOrigin(ctx) { - RepairTableStatement(visitMultipartIdentifier(ctx.multipartIdentifier())) + RepairTable( + UnresolvedTable(visitMultipartIdentifier(ctx.multipartIdentifier()), "MSCK REPAIR TABLE")) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala index c4ac8ea8f2e69..b731b8a2fd8fd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statements.scala @@ -383,11 +383,6 @@ case class CreateNamespaceStatement( */ case class UseStatement(isNamespaceSet: Boolean, nameParts: Seq[String]) extends ParsedStatement -/** - * A REPAIR TABLE statement, as parsed from SQL - */ -case class RepairTableStatement(tableName: Seq[String]) extends ParsedStatement - /** * A TRUNCATE TABLE statement, as parsed from SQL */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala index 1e17c51137a55..e014048f723f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/v2Commands.scala @@ -735,3 +735,10 @@ case class DropView( ifExists: Boolean) extends Command { override def children: Seq[LogicalPlan] = child :: Nil } + +/** + * The logical plan of the MSCK REPAIR TABLE command. 
+ */ +case class RepairTable(child: LogicalPlan) extends Command { + override def children: Seq[LogicalPlan] = child :: Nil +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index d5b27d9ad25cf..947154eae12c8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -1952,7 +1952,7 @@ class DDLParserSuite extends AnalysisTest { test("MSCK REPAIR TABLE") { comparePlans( parsePlan("MSCK REPAIR TABLE a.b.c"), - RepairTableStatement(Seq("a", "b", "c"))) + RepairTable(UnresolvedTable(Seq("a", "b", "c"), "MSCK REPAIR TABLE"))) } test("LOAD DATA INTO table") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 657764832a931..817a63aa9aa6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -408,11 +408,8 @@ class ResolveSessionCatalog( case AnalyzeColumn(ResolvedV1TableOrViewIdentifier(ident), columnNames, allColumns) => AnalyzeColumnCommand(ident.asTableIdentifier, columnNames, allColumns) - case RepairTableStatement(tbl) => - val v1TableName = parseV1Table(tbl, "MSCK REPAIR TABLE") - AlterTableRecoverPartitionsCommand( - v1TableName.asTableIdentifier, - "MSCK REPAIR TABLE") + case RepairTable(ResolvedV1TableIdentifier(ident)) => + AlterTableRecoverPartitionsCommand(ident.asTableIdentifier, "MSCK REPAIR TABLE") case LoadData(ResolvedV1TableIdentifier(ident), path, isLocal, isOverwrite, partition) => LoadDataCommand( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 7e2a485dcb4cc..37a4dcf081be4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -341,6 +341,9 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat table, pattern.map(_.asInstanceOf[ResolvedPartitionSpec])) :: Nil + case RepairTable(_: ResolvedTable) => + throw new AnalysisException("MSCK REPAIR TABLE is not supported for v2 tables.") + case _ => Nil } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index b1d61658b8a8b..9020065449cef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2011,7 +2011,7 @@ class DataSourceV2SQLSuite val t = "testcat.ns1.ns2.tbl" withTable(t) { spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") - testV1Command("MSCK REPAIR TABLE", t) + testNotSupportedV2Command("MSCK REPAIR TABLE", t) } } @@ -2612,13 +2612,6 @@ class DataSourceV2SQLSuite assert(e.message.contains(s"$cmdStr is not supported for v2 tables")) } - private def testV1Command(sqlCommand: String, sqlParams: String): Unit = { - val e = 
intercept[AnalysisException] { - sql(s"$sqlCommand $sqlParams") - } - assert(e.message.contains(s"$sqlCommand is only supported with v1 tables")) - } - private def assertAnalysisError(sqlStatement: String, expectedError: String): Unit = { val errMsg = intercept[AnalysisException] { sql(sqlStatement)