add comments to DataStreamReader.scala, readwriter.py and streaming.py

apache · Nov 27, 2020 · 1770c56 · 1770c56
1 parent b025271
commit 1770c56
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 0 deletions.
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
@@ -259,6 +259,27 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
             allows accepting quoting of all character
             using backslash quoting mechanism. If None is
             set, it uses the default value, ``false``.
+        unescapedQuoteHandling : str, optional
+            defines how the CsvParser will handle values with unescaped quotes. If None is
+            set, it uses the default value, ``STOP_AT_DELIMITER``.
+
+            * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate \
+              the quote character and proceed parsing the value as a quoted value, until a closing \
+              quote is found.
+            * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters of the current \
+              parsed value until the delimiter is found. If no delimiter is found in the value, the \
+              parser will continue accumulating characters from the input until a delimiter or line \
+              ending is found.
+            * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters until the \
+              delimiter or a line ending is found in the input.
+            * ``SKIP_VALUE``: If unescaped quotes are found in the input, the content parsed \
+              for the given value will be skipped and the value set in nullValue will be produced \
+              instead.
+            * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException \
+              will be thrown.
+
         mode : str, optional
             allows a mode for dealing with corrupt records during parsing. If None is
                      set, it uses the default value, ``PERMISSIVE``.

diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
@@ -851,6 +851,27 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         maxMalformedLogPerPartition : str or int, optional
             this parameter is no longer used since Spark 2.2.0.
             If specified, it is ignored.
+        unescapedQuoteHandling : str, optional
+            defines how the CsvParser will handle values with unescaped quotes. If None is
+            set, it uses the default value, ``STOP_AT_DELIMITER``.
+
+            * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate \
+              the quote character and proceed parsing the value as a quoted value, until a closing \
+              quote is found.
+            * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters of the current \
+              parsed value until the delimiter is found. If no delimiter is found in the value, the \
+              parser will continue accumulating characters from the input until a delimiter or line \
+              ending is found.
+            * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters until the \
+              delimiter or a line ending is found in the input.
+            * ``SKIP_VALUE``: If unescaped quotes are found in the input, the content parsed \
+              for the given value will be skipped and the value set in nullValue will be produced \
+              instead.
+            * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException \
+              will be thrown.
+
         mode : str, optional
             allows a mode for dealing with corrupt records during parsing. If None is
             set, it uses the default value, ``PERMISSIVE``.

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -396,6 +396,27 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    * a record can have.</li>
    * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
    * for any given value being read. By default, it is -1 meaning unlimited length</li>
+   * <li>`unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser
+   * will handle values with unescaped quotes.
+   *   <ul>
+   *     <li>`STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate
+   *     the quote character and proceed parsing the value as a quoted value, until a closing
+   *     quote is found.</li>
+   *     <li>`BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the value
+   *     as an unquoted value. This will make the parser accumulate all characters of the current
+   *     parsed value until the delimiter is found. If no delimiter is found in the value, the
+   *     parser will continue accumulating characters from the input until a delimiter or line
+   *     ending is found.</li>
+   *     <li>`STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the value
+   *     as an unquoted value. This will make the parser accumulate all characters until the
+   *     delimiter or a line ending is found in the input.</li>
+   *     <li>`SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed
+   *     for the given value will be skipped and the value set in nullValue will be produced
+   *     instead.</li>
+   *     <li>`RAISE_ERROR`: If unescaped quotes are found in the input, a TextParsingException
+   *     will be thrown.</li>
+   *   </ul>
+   * </li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
    *    during parsing. It supports the following case-insensitive modes.
    *   <ul>