From 1f562159bf61dd5e536db7841b16e74a635e7a97 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 13 Apr 2021 15:08:01 +0300 Subject: [PATCH] [SPARK-35045][SQL] Add an internal option to control input buffer in univocity ### What changes were proposed in this pull request? This PR makes the input buffer configurable (as an internal option). This is mainly to work around uniVocity/univocity-parsers#449. ### Why are the changes needed? To work around uniVocity/univocity-parsers#449. ### Does this PR introduce _any_ user-facing change? No, it's only an internal option. ### How was this patch tested? Manually tested by modifying the unit test added in https://github.com/apache/spark/pull/31858 as below: ```diff diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index fd25a79619d..b58f0bd3661 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2460,6 +2460,7 @@ abstract class CSVSuite Seq(line).toDF.write.text(path.getAbsolutePath) assert(spark.read.format("csv") .option("delimiter", "|") + .option("inputBufferSize", "128") .option("ignoreTrailingWhiteSpace", "true").load(path.getAbsolutePath).count() == 1) } } ``` Closes #32145 from HyukjinKwon/SPARK-35045. 
Lead-authored-by: Hyukjin Kwon Co-authored-by: HyukjinKwon Signed-off-by: Max Gekk --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index c6a80611ea7c4..2e5539a90c65d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -211,6 +211,8 @@ class CSVOptions( } val lineSeparatorInWrite: Option[String] = lineSeparator + val inputBufferSize: Option[Int] = parameters.get("inputBufferSize").map(_.toInt) + /** * The handling method to be used when unescaped quotes are found in the input. */ @@ -257,6 +259,7 @@ class CSVOptions( settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) settings.setReadInputOnSeparateThread(false) + inputBufferSize.foreach(settings.setInputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) settings.setEmptyValue(emptyValueInRead)