From 1770c565aa573e6f32e404b4f775f2c12edcae2e Mon Sep 17 00:00:00 2001
From: yangjie01
Date: Fri, 27 Nov 2020 10:52:36 +0800
Subject: [PATCH] add comments to DataStreamReader.scala, readwriter.py and streaming.py

---
 python/pyspark/sql/readwriter.py                 | 21 +++++++++++++++++++
 python/pyspark/sql/streaming.py                  | 21 +++++++++++++++++++
 .../sql/streaming/DataStreamReader.scala         | 21 +++++++++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index bb31e6a3e09f8..b492198f2959c 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -259,6 +259,27 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         allowBackslashEscapingAnyCharacter : str or bool, optional
             allows accepting quoting of all character using backslash quoting mechanism. If None is
             set, it uses the default value, ``false``.
+        unescapedQuoteHandling : str, optional
+            defines how the CsvParser will handle values with unescaped quotes. If None is
+            set, it uses the default value, ``STOP_AT_DELIMITER``.
+
+            * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate \
+              the quote character and proceed parsing the value as a quoted value, until a closing \
+              quote is found.
+            * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters of the current \
+              parsed value until the delimiter is found. If no delimiter is found in the value, the \
+              parser will continue accumulating characters from the input until a delimiter or line \
+              ending is found.
+            * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters until the \
+              delimiter or a line ending is found in the input.
+            * ``SKIP_VALUE``: If unescaped quotes are found in the input, the content parsed \
+              for the given value will be skipped and the value set in nullValue will be produced \
+              instead.
+            * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException \
+              will be thrown.
+
         mode : str, optional
             allows a mode for dealing with corrupt records during parsing. If None is
             set, it uses the default value, ``PERMISSIVE``.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index e7b2fa16d620a..63fcbd52fd366 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -851,6 +851,27 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         maxMalformedLogPerPartition : str or int, optional
             this parameter is no longer used since Spark 2.2.0. If specified, it is
             ignored.
+        unescapedQuoteHandling : str, optional
+            defines how the CsvParser will handle values with unescaped quotes. If None is
+            set, it uses the default value, ``STOP_AT_DELIMITER``.
+
+            * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate \
+              the quote character and proceed parsing the value as a quoted value, until a closing \
+              quote is found.
+            * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters of the current \
+              parsed value until the delimiter is found. If no delimiter is found in the value, the \
+              parser will continue accumulating characters from the input until a delimiter or line \
+              ending is found.
+            * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+              as an unquoted value. This will make the parser accumulate all characters until the \
+              delimiter or a line ending is found in the input.
+            * ``SKIP_VALUE``: If unescaped quotes are found in the input, the content parsed \
+              for the given value will be skipped and the value set in nullValue will be produced \
+              instead.
+            * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException \
+              will be thrown.
+
         mode : str, optional
             allows a mode for dealing with corrupt records during parsing. If None is
             set, it uses the default value, ``PERMISSIVE``.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index 9bc4acd49a980..7f4ef8be562fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -396,6 +396,27 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    * a record can have.</li>
   * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
   * for any given value being read. By default, it is -1 meaning unlimited length</li>
+  * <li>`unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser
+  * will handle values with unescaped quotes.
+  *   <ul>
+  *     <li>`STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate
+  *       the quote character and proceed parsing the value as a quoted value, until a
+  *       closing quote is found.</li>
+  *     <li>`BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the
+  *       value as an unquoted value. This will make the parser accumulate all characters
+  *       of the current parsed value until the delimiter is found. If no delimiter is
+  *       found in the value, the parser will continue accumulating characters from the
+  *       input until a delimiter or line ending is found.</li>
+  *     <li>`STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the
+  *       value as an unquoted value. This will make the parser accumulate all characters
+  *       until the delimiter or a line ending is found in the input.</li>
+  *     <li>`SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed
+  *       for the given value will be skipped and the value set in nullValue will be
+  *       produced instead.</li>
+  *     <li>`RAISE_ERROR`: If unescaped quotes are found in the input, a
+  *       TextParsingException will be thrown.</li>
+  *   </ul>
+  * </li>
   * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
   * during parsing. It supports the following case-insensitive modes.
   *   <ul>
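
For reference, a minimal PySpark sketch (not part of the patch) of how the option documented above is passed to both the batch and streaming CSV readers on a build that includes this change. The file paths and sample data below are hypothetical.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("unescaped-quote-demo").getOrCreate()

    # One well-formed row and one row whose quoted field contains unescaped quotes.
    with open("/tmp/demo.csv", "w") as f:
        f.write('id,desc\n1,"plain value"\n2,"a "bad" value"\n')

    # Batch read: keep accumulating until the closing quote instead of the
    # default STOP_AT_DELIMITER behavior.
    df = (spark.read
          .option("header", "true")
          .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
          .csv("/tmp/demo.csv"))
    df.show(truncate=False)

    # The same option name applies to the streaming reader touched by this
    # patch; file-based streaming sources require an explicit schema.
    sdf = (spark.readStream
           .schema(df.schema)
           .option("unescapedQuoteHandling", "SKIP_VALUE")
           .csv("/tmp/demo-input-dir"))  # hypothetical input directory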