From b2f63f345a2b9105b2fbd952a27ff98866bebd0a Mon Sep 17 00:00:00 2001 From: JulianCologne Date: Sat, 13 Jul 2024 20:44:57 +0200 Subject: [PATCH 1/2] feat(python): add `infer_schema` parameter to `read_csv` --- py-polars/polars/io/csv/functions.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index bc73a1b677a1..2507a0ed5bf7 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -53,6 +53,7 @@ def read_csv( ignore_errors: bool = False, try_parse_dates: bool = False, n_threads: int | None = None, + infer_schema: bool = True, infer_schema_length: int | None = N_INFER_DEFAULT, batch_size: int = 8192, n_rows: int | None = None, @@ -126,7 +127,7 @@ def read_csv( Before using this option, try to increase the number of lines used for schema inference with e.g `infer_schema_length=10000` or override automatic dtype inference for specific columns with the `schema_overrides` option or use - `infer_schema_length=0` to read all columns as `pl.String` to check which + `infer_schema=False` to read all columns as `pl.String` to check which values might cause an issue. try_parse_dates Try to automatically parse dates. Most ISO8601-like formats can @@ -136,10 +137,15 @@ def read_csv( n_threads Number of threads to use in csv parsing. Defaults to the number of physical cpu's of your system. + infer_schema + When `True`, the schema is inferred from the data using the first + `infer_schema_length` rows. + When `False`, the schema is not inferred and will be `pl.String` if not + specified in `schema` or `schema_overrides`. infer_schema_length The maximum number of rows to scan for schema inference. - If set to `0`, all columns will be read as `pl.String`. If set to `None`, the full data may be scanned *(this is slow)*. + Set `infer_schema=False` to read all columns as `pl.String`. batch_size Number of lines to read into the buffer at once. Modify this to change performance. @@ -184,7 +190,7 @@ def read_csv( with windows line endings (`\r\n`), one can go with the default `\n`. The extra `\r` will be removed when processed. raise_if_empty - When there is no data in the source,`NoDataError` is raised. If this parameter + When there is no data in the source, `NoDataError` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. truncate_ragged_lines Truncate lines that are longer than the schema. @@ -410,6 +416,9 @@ def read_csv( for column_name, column_dtype in schema_overrides.items() } + if not infer_schema: + infer_schema_length = 0 + with prepare_file_arg( source, encoding=encoding, From 4cf80eac1a12c7c4df3ea723e82d0f7e2bfa1ef9 Mon Sep 17 00:00:00 2001 From: JulianCologne Date: Sun, 14 Jul 2024 10:02:47 +0200 Subject: [PATCH 2/2] add parameter for `scan_csv` and add test for `read_csv` --- py-polars/polars/io/csv/functions.py | 15 ++++++++++++--- py-polars/tests/unit/io/test_csv.py | 13 +++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 2507a0ed5bf7..34b603174d61 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -931,6 +931,7 @@ def scan_csv( ignore_errors: bool = False, cache: bool = True, with_column_names: Callable[[list[str]], list[str]] | None = None, + infer_schema: bool = True, infer_schema_length: int | None = N_INFER_DEFAULT, n_rows: int | None = None, encoding: CsvEncoding = "utf8", @@ -998,17 +999,22 @@ def scan_csv( utf8 values to be treated as the empty string you can set this param True. ignore_errors Try to keep reading lines if some lines yield errors. - First try `infer_schema_length=0` to read all columns as + First try `infer_schema=False` to read all columns as `pl.String` to check which values might cause an issue. cache Cache the result after reading. with_column_names Apply a function over the column names just in time (when they are determined); this function will receive (and should return) a list of column names. + infer_schema + When `True`, the schema is inferred from the data using the first + `infer_schema_length` rows. + When `False`, the schema is not inferred and will be `pl.String` if not + specified in `schema` or `schema_overrides`. infer_schema_length The maximum number of rows to scan for schema inference. - If set to `0`, all columns will be read as `pl.String`. If set to `None`, the full data may be scanned *(this is slow)*. + Set `infer_schema=False` to read all columns as `pl.String`. n_rows Stop reading from CSV file after reading `n_rows`. encoding : {'utf8', 'utf8-lossy'} @@ -1038,7 +1044,7 @@ def scan_csv( scanning a headerless CSV file). If the given list is shorter than the width of the DataFrame the remaining columns will have their original name. raise_if_empty - When there is no data in the source,`NoDataError` is raised. If this parameter + When there is no data in the source, `NoDataError` is raised. If this parameter is set to False, an empty LazyFrame (with no columns) is returned instead. truncate_ragged_lines Truncate lines that are longer than the schema. @@ -1162,6 +1168,9 @@ def with_column_names(cols: list[str]) -> list[str]: normalize_filepath(source, check_not_directory=False) for source in source ] + if not infer_schema: + infer_schema_length = 0 + return _scan_csv_impl( source, has_header=has_header, diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index bc43533a3150..772a1ea74d7f 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -113,6 +113,19 @@ def test_normalize_filepath(io_files_path: Path) -> None: ) +def test_infer_schema_false() -> None: + csv = textwrap.dedent( + """\ + a,b,c + 1,2,3 + 1,2,3 + """ + ) + f = io.StringIO(csv) + df = pl.read_csv(f, infer_schema=False) + assert df.dtypes == [pl.String, pl.String, pl.String] + + def test_csv_null_values() -> None: csv = textwrap.dedent( """\