Skip to content

Commit

Permalink
Add extra columns when reading Delimited file (#12231)
Browse files Browse the repository at this point in the history
- Closes #12186
  • Loading branch information
radeusgd authored Feb 5, 2025
1 parent 7e651ea commit a829bdb
Show file tree
Hide file tree
Showing 8 changed files with 219 additions and 37 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,17 @@
- [Reducing helper methods in `Standard.Base.Meta`.][12031]
- [Added Table.Offset][12071]
- [Added Column.Offset][12092]
- [When reading a Delimited file, if a row with more columns than expected is
encountered, extra columns can be added to the result.][12231]
- In `Delimited` format, the `keep_invalid_rows` setting has been renamed to
`on_invalid_rows`. The default behaviour was also changed to add any extra
columns instead of discarding them.

[11926]: https://github.com/enso-org/enso/pull/11926
[12031]: https://github.com/enso-org/enso/pull/12031
[12071]: https://github.com/enso-org/enso/pull/12071
[12092]: https://github.com/enso-org/enso/pull/12092
[12231]: https://github.com/enso-org/enso/pull/12231

#### Enso Language & Runtime

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ from Standard.Base.System.File_Format import parse_boolean_with_infer
from Standard.Base.Widget_Helpers import make_file_read_delimiter_selector

import project.Data_Formatter.Data_Formatter
import project.Delimited.Invalid_Rows.Invalid_Rows
import project.Delimited.Quote_Style.Quote_Style
import project.Headers.Headers
import project.Internal.Delimited_Reader
Expand Down Expand Up @@ -45,9 +46,8 @@ type Delimited_Format
appended to disambiguate them.
- value_formatter: Formatter to parse text values into numbers, dates,
times, etc. If `Nothing` values are left as Text.
- keep_invalid_rows: Specifies whether rows that contain less or more
columns than expected should be kept (setting the missing columns to
`Nothing` or dropping the excess columns) or dropped.
- on_invalid_rows: Specifies how to handle rows that have fewer or more
columns than the first row.
- line_endings: Sets the line ending style to use. Defaults to `Infer` -
when reading a file or appending to an existing file, the line endings
are detected from file contents; when writing a new file in `Infer`
Expand All @@ -61,7 +61,7 @@ type Delimited_Format
@delimiter make_file_read_delimiter_selector
@encoding Encoding.default_widget
@row_limit Rows_To_Read.default_widget
Delimited (delimiter:Text=',') (encoding:Encoding=Encoding.default) (skip_rows:Integer=0) (row_limit:Rows_To_Read=..All_Rows) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Headers=Headers.Detect_Headers) (value_formatter:Data_Formatter|Nothing=Data_Formatter.Value) (keep_invalid_rows:Boolean=True) (line_endings:Line_Ending_Style|Infer=Infer) (comment_character:Text|Nothing=Nothing)
Delimited (delimiter:Text=',') (encoding:Encoding=Encoding.default) (skip_rows:Integer=0) (row_limit:Rows_To_Read=..All_Rows) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Headers=Headers.Detect_Headers) (value_formatter:Data_Formatter|Nothing=Data_Formatter.Value) (on_invalid_rows:Invalid_Rows=Invalid_Rows.Add_Extra_Columns) (line_endings:Line_Ending_Style|Infer=Infer) (comment_character:Text|Nothing=Nothing)

## PRIVATE
Resolve an unresolved constructor to the actual type.
Expand Down Expand Up @@ -125,8 +125,8 @@ type Delimited_Format
## PRIVATE
   Clone the instance with some properties overridden.

   The `delimiter`, `skip_rows` and `row_limit` settings are always copied
   from `self`; every other setting defaults to the current value unless an
   override is passed.
clone : Encoding -> Quote_Style -> Headers -> (Data_Formatter|Nothing) -> Invalid_Rows -> (Line_Ending_Style|Infer) -> (Text|Nothing) -> Delimited_Format
clone self (encoding:Encoding = self.encoding) (quote_style:Quote_Style=self.quote_style) (headers:Headers=self.headers) (value_formatter=self.value_formatter) (on_invalid_rows:Invalid_Rows=self.on_invalid_rows) (line_endings=self.line_endings) (comment_character=self.comment_character) =
    Delimited_Format.Delimited self.delimiter encoding self.skip_rows self.row_limit quote_style headers value_formatter on_invalid_rows line_endings comment_character

## ICON data_input
Create a clone of this with specified quoting settings.
Expand Down Expand Up @@ -194,14 +194,19 @@ Delimited_Format.from (that : JS_Object) =
headers = that.get "headers" |> parse_boolean_with_infer "headers"
skip_rows = that.get "skip_rows" . if_nothing 0
row_limit = that.get "row_limit"
keep_invalid_rows = that.get "keep_invalid_rows" . if_nothing True
on_invalid_rows = case that.get "on_invalid_rows" of
True -> Invalid_Rows.Keep_Invalid_Rows
False -> Invalid_Rows.Drop_Invalid_Rows
"add_extra_columns" -> Invalid_Rows.Add_Extra_Columns
Nothing -> Invalid_Rows.Add_Extra_Columns
other -> Error.throw (Illegal_Argument.Error "Invalid value for `on_invalid_rows`: "+other.to_display_text)
quote_style = case that.get "quote_style" of
Nothing -> Quote_Style.With_Quotes
json -> Quote_Style.from json

unsupported_fields = ["value_formatter", "line_endings", "comment_character"]
case unsupported_fields.find that.contains_key if_missing=Nothing of
Nothing ->
Delimited_Format.Delimited delimiter=delimiter encoding=encoding headers=headers skip_rows=skip_rows row_limit=row_limit quote_style=quote_style keep_invalid_rows=keep_invalid_rows
Delimited_Format.Delimited delimiter=delimiter encoding=encoding headers=headers skip_rows=skip_rows row_limit=row_limit quote_style=quote_style on_invalid_rows=on_invalid_rows
field ->
Error.throw (Illegal_Argument.Error ("The field `" ++ field ++ "` is currently not supported when deserializing the Delimited format from JSON."))
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from Standard.Base import Boolean, False, True

## Describes what to do with rows that have an unexpected number of columns.
type Invalid_Rows
    ## Rows that contain too few or too many columns are dropped.
    Drop_Invalid_Rows

    ## Rows that contain too few or too many columns are kept.

       - If a row has too few columns, the missing columns are filled with
         `Nothing`.
       - If it has too many, the extra columns are dropped.
    Keep_Invalid_Rows

    ## Rows that contain too few or too many columns are kept, and any extra
       columns are added to the result.

       - If a row has too few columns, the missing columns are filled with
         `Nothing`.
       - If it has too many, the extra columns are kept. The previous rows
         that had fewer columns are filled with `Nothing`.
    Add_Extra_Columns

## PRIVATE
   Backward-compatibility conversion from a Boolean flag: `True` becomes
   `Keep_Invalid_Rows` and `False` becomes `Drop_Invalid_Rows`.
Invalid_Rows.from (that : Boolean) =
    if that then Invalid_Rows.Keep_Invalid_Rows else Invalid_Rows.Drop_Invalid_Rows
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import Standard.Base.System.Input_Stream.Input_Stream

import project.Data_Formatter.Data_Formatter
import project.Delimited.Delimited_Format.Delimited_Format
import project.Delimited.Invalid_Rows.Invalid_Rows
import project.Delimited.Quote_Style.Quote_Style
import project.Headers.Headers
import project.Internal.Java_Problems
Expand All @@ -25,6 +26,7 @@ polyglot java import org.enso.table.parsing.problems.MismatchedQuote
polyglot java import org.enso.table.parsing.TypeInferringParser
polyglot java import org.enso.table.read.DelimitedReader
polyglot java import org.enso.table.read.DelimitedReader.HeaderBehavior
polyglot java import org.enso.table.read.DelimitedReader.InvalidRowsBehavior
polyglot java import org.enso.table.read.ParsingFailedException
polyglot java import org.enso.table.read.QuoteStrippingParser

Expand Down Expand Up @@ -124,17 +126,25 @@ prepare_reader format:Delimited_Format max_columns on_problems:Problem_Behavior
newline = newline_override.if_nothing <| case format.line_endings of
Infer -> Nothing
endings -> endings.to_text
on_invalid_rows_java = case format.on_invalid_rows of
Invalid_Rows.Keep_Invalid_Rows -> InvalidRowsBehavior.KEEP
Invalid_Rows.Drop_Invalid_Rows -> InvalidRowsBehavior.DROP
Invalid_Rows.Add_Extra_Columns -> InvalidRowsBehavior.ADD_EXTRA_COLUMNS

warnings_as_errors = on_problems == Problem_Behavior.Report_Error
DelimitedReader.new format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows newline format.comment_character warnings_as_errors java_problem_aggregator
DelimitedReader.new format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser on_invalid_rows_java newline format.comment_character warnings_as_errors java_problem_aggregator

## PRIVATE
An internal type representing columns deduced from an existing file.
type Detected_Headers
## Represents the headers found in the file.
Existing (column_names : Vector Text)

## Indicates that the file exists but no headers have been found, so only positional column matching is possible.
## Indicates that the file exists but no headers have been found,
so only positional column matching is possible.

Note that the file may still contain rows that have fewer or more columns
than specified here. This column count is only based on the first row.
None (column_count : Integer)

## PRIVATE
Expand Down
13 changes: 13 additions & 0 deletions std-bits/table/src/main/java/org/enso/table/data/table/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public Table(Column[] columns) {
throw new IllegalArgumentException("Column names must be unique within a Table.");
}

assert checkAllColumnsHaveSameSize(columns) : "All columns must have the same row count.";

this.columns = columns;
}

Expand All @@ -67,6 +69,17 @@ private static boolean checkUniqueColumns(Column[] columns) {
return true;
}

/**
 * Checks that every column reports the same size.
 *
 * @param columns the columns to compare
 * @return {@code true} if all columns have the same size (trivially so for an empty array),
 *     {@code false} as soon as one column differs from the first
 */
private static boolean checkAllColumnsHaveSameSize(Column[] columns) {
  // Guard the `columns[0]` access below: with no columns there is nothing that can mismatch.
  if (columns.length == 0) {
    return true;
  }

  int expectedSize = columns[0].getSize();
  for (Column column : columns) {
    if (column.getSize() != expectedSize) {
      return false;
    }
  }

  return true;
}

/**
* @return the number of rows in this table
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.enso.table.read;

import java.util.List;

/**
* Metadata that can be detected by the DelimitedReader.
*
Expand All @@ -14,6 +16,6 @@
*/
public record DelimitedFileMetadata(
long columnCount,
String[] definedColumnNames,
List<String> definedColumnNames,
boolean hasAnyContent,
String effectiveLineSeparator) {}
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class DelimitedReader {
private final CsvParser parser;
private final DatatypeParser valueParser;
private final TypeInferringParser cellTypeGuesser;
private final boolean keepInvalidRows;
private final InvalidRowsBehavior keepInvalidRows;
private String newlineSetting;
private final NoOpParseProblemAggregator noOpProblemAggregator = new NoOpParseProblemAggregator();
private long targetTableIndex = 0;
Expand All @@ -69,6 +69,7 @@ public class DelimitedReader {
private long currentLine = 0;

private List<BuilderForType<String>> builders = null;
private int initialColumnCount = 0;
private final DelimitedReaderProblemAggregator problemAggregator;

/**
Expand Down Expand Up @@ -111,7 +112,7 @@ public DelimitedReader(
int maxColumns,
DatatypeParser valueParser,
TypeInferringParser cellTypeGuesser,
boolean keepInvalidRows,
InvalidRowsBehavior keepInvalidRows,
String newline,
String commentCharacter,
boolean warningsAsErrors,
Expand Down Expand Up @@ -266,17 +267,36 @@ private void appendRow(String[] row) {
assert canFitMoreRows();

if (row.length != builders.size()) {
problemAggregator.reportInvalidRow(
currentLine, keepInvalidRows ? targetTableIndex : null, row, builders.size());
boolean isRowKept =
switch (keepInvalidRows) {
case DROP -> false;
case KEEP, ADD_EXTRA_COLUMNS -> true;
};

// The error is only reported if the column count does not match the initial column count.
// Otherwise, a single row with more columns in ADD_EXTRA_COLUMNS mode will expand the
// builders and all subsequent rows (that had original column count) would turn into warnings.
// Such flood of warnings is not useful. Instead, we only warn on the occurrences that expand
// the column count, or that have fewer columns than originally expected.
if (row.length != initialColumnCount) {
problemAggregator.reportInvalidRow(
currentLine, isRowKept ? targetTableIndex : null, row, builders.size());
}

if (isRowKept) {
// If the current row had more columns than expected, they are either discarded or added as
// extra columns.
if (keepInvalidRows == InvalidRowsBehavior.ADD_EXTRA_COLUMNS
&& row.length > builders.size()) {
addExtraColumns(row.length - builders.size());
}

if (keepInvalidRows) {
for (int i = 0; i < builders.size() && i < row.length; i++) {
builders.get(i).append(row[i]);
}

// If the current row had fewer columns than expected, nulls are inserted for the missing
// values.
// If it had more columns, the excess columns are discarded.
for (int i = row.length; i < builders.size(); i++) {
builders.get(i).appendNulls(1);
}
Expand All @@ -292,6 +312,17 @@ private void appendRow(String[] row) {
}
}

/**
 * Appends {@code count} additional columns to the table being built.
 *
 * <p>Each new column gets a generated name based on its 1-based position and a builder that is
 * pre-filled with nulls for the rows appended so far, so that it lines up with the existing
 * builders.
 *
 * @param count the number of columns to add
 */
private void addExtraColumns(int count) {
  for (int i = 0; i < count; i++) {
    int columnIndex = builders.size() + 1;
    String baseName = COLUMN_NAME + " " + columnIndex;

    // The generated name may already be taken (e.g. by a header defined in the input). A Table
    // requires unique column names, so probe numeric suffixes until a free name is found.
    String columnName = baseName;
    for (int suffix = 1; effectiveColumnNames.contains(columnName); suffix++) {
      columnName = baseName + " " + suffix;
    }
    effectiveColumnNames.add(columnName);

    var builder = constructBuilder(targetTableIndex);
    // We ensure the new builder has the same length as the previous ones by padding with nulls.
    builder.appendNulls(Math.toIntExact(targetTableIndex));
    builders.add(builder);
  }
}

private boolean canFitMoreRows() {
return rowLimit < 0 || targetTableIndex < rowLimit;
}
Expand Down Expand Up @@ -324,18 +355,18 @@ private boolean isPlainText(String cell) {
}

/** The column names as defined in the input (if applicable, otherwise null). */
private String[] definedColumnNames = null;
private List<String> definedColumnNames = null;

/**
* The effective column names.
*
* <p>If {@code GENERATE_HEADERS} is used or if {@code INFER} is used and no headers are found,
* this will be populated with automatically generated column names.
*/
private String[] effectiveColumnNames;
private List<String> effectiveColumnNames;

private int getColumnCount() {
return effectiveColumnNames.length;
return effectiveColumnNames.size();
}

/**
Expand Down Expand Up @@ -380,7 +411,7 @@ private void detectHeaders() {
}

if (firstRow == null) {
effectiveColumnNames = new String[0];
effectiveColumnNames = List.of();
return;
}

Expand Down Expand Up @@ -423,9 +454,11 @@ private void detectHeaders() {
default -> throw new IllegalStateException("Impossible branch.");
}

effectiveColumnNames = headerNames.toArray(new String[0]);
effectiveColumnNames = headerNames;
if (wereHeadersDefined) {
definedColumnNames = effectiveColumnNames;
// We need a copy of the defined column names, as the effective column names may be modified
// later.
definedColumnNames = new ArrayList<>(effectiveColumnNames);
}
}

Expand All @@ -445,6 +478,7 @@ public Table read(Reader input) {
throw new EmptyFileException();
}

initialColumnCount = columnCount;
initBuilders(columnCount);
while (canFitMoreRows()) {
var currentRow = readNextRow();
Expand All @@ -461,7 +495,7 @@ public Table read(Reader input) {

Column[] columns = new Column[builders.size()];
for (int i = 0; i < builders.size(); i++) {
String columnName = effectiveColumnNames[i];
String columnName = effectiveColumnNames.get(i);
var stringStorage = builders.get(i).seal();

// We don't expect InvalidFormat to be propagated back to Enso, there is no particular type
Expand Down Expand Up @@ -493,10 +527,14 @@ private void markUsed() {
private void initBuilders(int count) {
builders = new ArrayList<>(count);
for (int i = 0; i < count; i++) {
builders.add(Builder.getForText(TextType.VARIABLE_LENGTH, INITIAL_ROW_CAPACITY));
builders.add(constructBuilder(INITIAL_ROW_CAPACITY));
}
}

/**
 * Creates a builder for a variable-length text column.
 *
 * @param initialCapacity the initial capacity passed on to the builder
 * @return the newly created text builder
 */
private BuilderForType<String> constructBuilder(long initialCapacity) {
  BuilderForType<String> textBuilder =
      Builder.getForText(TextType.VARIABLE_LENGTH, initialCapacity);
  return textBuilder;
}

/** Specifies how to set the headers for the returned table. */
public enum HeaderBehavior {
/** Tries to infer if the headers are present in the file. */
Expand All @@ -510,4 +548,16 @@ public enum HeaderBehavior {
*/
GENERATE_HEADERS
}

/**
 * Specifies how to handle rows whose number of cells differs from the current number of columns.
 */
public enum InvalidRowsBehavior {
  /** Discards rows with an unexpected number of columns. */
  DROP,

  /**
   * Keeps rows with an unexpected number of columns: missing cells are filled with nulls and any
   * additional cells are discarded.
   */
  KEEP,

  /**
   * Keeps rows with an unexpected number of columns: missing cells are filled with nulls and
   * additional cells are stored in newly added columns, which are padded with nulls for the rows
   * read before them.
   */
  ADD_EXTRA_COLUMNS
}
}
Loading

0 comments on commit a829bdb

Please sign in to comment.