Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #162 - internal error when last cell is empty and last line is not terminated #163

Merged
merged 1 commit into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 26 additions & 27 deletions src/main/java/io/deephaven/csv/reading/CellGrabber.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,37 +75,33 @@ public CellGrabber(
* trimming.
*
* @param dest The result, as a {@link ByteSlice}. The ByteSlice is invalidated by the next call to grabNext.
* @param lastInRow An out parameter whose contents are only specified if this method returns true. Its contents
* will be set to true if the cell just read was the last cell in the row, otherwise they will be set to
* false.
* @return true if a cell was read; false if at end of input.
* @param lastInRow An out parameter which will be set to true if the cell just read was the last cell in the row,
* otherwise it will be set to false.
* @param endOfInput An out parameter which will be set to true if the cell just read encountered the end of the
* input, otherwise it will be set to false.
*/
public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow)
throws CsvReaderException {
public void grabNext(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput) throws CsvReaderException {
spillBuffer.clear();
startOffset = offset;

if (ignoreSurroundingSpaces) {
skipWhitespace();
}
if (!tryEnsureMore()) {
return false;
}

// Is first char the quote char?
if (buffer[offset] == quoteChar) {
if (tryEnsureMore() && buffer[offset] == quoteChar) {
++offset;
processQuotedMode(dest, lastInRow);
processQuotedMode(dest, lastInRow, endOfInput);
if (trim) {
trimWhitespace(dest);
}
} else {
processUnquotedMode(dest, lastInRow);
processUnquotedMode(dest, lastInRow, endOfInput);
if (ignoreSurroundingSpaces) {
trimWhitespace(dest);
}
}
return true;
}

/**
Expand All @@ -114,8 +110,8 @@ public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow)
* @param lastInRow An out parameter. Its contents will be set to true if the cell just read was the last cell in
* the row, otherwise the contents will be set to false.
*/
private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow)
throws CsvReaderException {
private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput) throws CsvReaderException {
startOffset = offset;
boolean prevCharWasCarriageReturn = false;
while (true) {
Expand Down Expand Up @@ -162,7 +158,7 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn
// We got out of the quoted string. Consume any trailing matter after the quote and before the
// field
// delimiter. Hopefully that trailing matter is just whitespace, but we shall see.
finishField(dest, lastInRow);
finishField(dest, lastInRow, endOfInput);

// From this point on, note that dest is a slice that may point to the underlying input buffer
// or the spill buffer. Take care from this point on to not disturb the input (e.g. by reading
Expand All @@ -183,10 +179,10 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn
/**
* Process characters in "unquoted mode". This is easy: eat characters until the next field or line delimiter.
*/
private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow)
throws CsvReaderException {
private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput) throws CsvReaderException {
startOffset = offset;
finishField(dest, lastInRow);
finishField(dest, lastInRow, endOfInput);
}

/** Skip whitespace but do not consider the field delimiter to be whitespace. */
Expand All @@ -211,28 +207,30 @@ private void skipWhitespace() throws CsvReaderException {
* @param lastInRow An out parameter. Its contents are set to true if the cell was the last one in the row.
* Otherwise, its contents are set to false.
*/
private void finishField(final ByteSlice dest, final MutableBoolean lastInRow)
private void finishField(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput)
throws CsvReaderException {
while (true) {
if (offset == size) {
if (!tryEnsureMore()) {
finish(dest);
// End of file sets last in row.
lastInRow.setValue(true);
return;
}
if (!tryEnsureMore()) {
finish(dest);
// End of input sets both flags.
lastInRow.setValue(true);
endOfInput.setValue(true);
return;
}
final byte ch = buffer[offset];
if (ch == fieldDelimiter) {
finish(dest);
++offset; // ... and skip over the field delimiter.
lastInRow.setValue(false);
endOfInput.setValue(false);
return;
}
if (ch == '\n') {
finish(dest);
++offset;
lastInRow.setValue(true);
endOfInput.setValue(false);
++physicalRowNum;
return;
}
Expand All @@ -252,6 +250,7 @@ private void finishField(final ByteSlice dest, final MutableBoolean lastInRow)
finish(dest);
dest.reset(dest.data(), dest.begin(), dest.end() - excess);
lastInRow.setValue(true);
endOfInput.setValue(false);
++physicalRowNum;
return;
}
Expand Down
10 changes: 6 additions & 4 deletions src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] heade
}

/**
* Try to read one row from the input. Returns false if the input ends before one row has been read.
* Try to read one row from the input. Returns null if the input is empty
*
* @return The first row as a byte[][] or null if the input was exhausted.
*/
Expand All @@ -297,14 +297,16 @@ private static byte[][] tryReadOneRow(final CellGrabber grabber) throws CsvReade
// Grab the header
final ByteSlice slice = new ByteSlice();
final MutableBoolean lastInRow = new MutableBoolean();
final MutableBoolean endOfInput = new MutableBoolean();
do {
if (!grabber.grabNext(slice, lastInRow)) {
return null;
}
grabber.grabNext(slice, lastInRow, endOfInput);
final byte[] item = new byte[slice.size()];
slice.copyTo(item, 0);
headers.add(item);
} while (!lastInRow.booleanValue());
if (headers.size() == 1 && headers.get(0).length == 0 && endOfInput.booleanValue()) {
return null;
}
return headers.toArray(new byte[0][]);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ private static class RowAppender {
private final int numCols;
private final ByteSlice byteSlice;
private final MutableBoolean lastInRow;
private final MutableBoolean endOfInput;
private final byte[][] nullValueLiteralsAsUtf8;

public RowAppender(final String[] columnHeaders, final byte[][] optionalFirstDataRow, final CellGrabber grabber,
Expand All @@ -109,6 +110,7 @@ public RowAppender(final String[] columnHeaders, final byte[][] optionalFirstDat
}
byteSlice = new ByteSlice();
lastInRow = new MutableBoolean();
endOfInput = new MutableBoolean();
// Here we prepare ahead of time what we are going to do when we encounter a short row. Say the input looks
// like:
// 10,20,30,40,50
Expand Down Expand Up @@ -174,18 +176,15 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE
int colNum = 0;
for (colNum = 0; colNum < numCols; ++colNum) {
try {
if (!grabber.grabNext(byteSlice, lastInRow)) {
if (colNum == 0) {
// Input exhausted
return RowResult.END_OF_INPUT;
}
// Can't get here. If there is any data at all in the last row, and *then* the file
// ends, grabNext() will return true, with lastInRow set.
throw new RuntimeException("Logic error: uncaught short last row");
}
grabber.grabNext(byteSlice, lastInRow, endOfInput);
if (lastInRow.booleanValue()) {
if (byteSlice.size() == 0 && colNum == 0 && specs.ignoreEmptyLines()) {
return RowResult.IGNORED_EMPTY_ROW;
if (colNum == 0 && byteSlice.size() == 0) {
if (endOfInput.booleanValue()) {
return RowResult.END_OF_INPUT;
}
if (specs.ignoreEmptyLines()) {
return RowResult.IGNORED_EMPTY_ROW;
}
}
appendToDenseStorageWriter(dsws[colNum], byteSlice, writeToConsumer);
++colNum;
Expand All @@ -208,10 +207,7 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE
}
// Eat.
while (!lastInRow.booleanValue()) {
if (!grabber.grabNext(byteSlice, lastInRow)) {
// Can't happen. Won't get end of input while finishing excess row.
throw new RuntimeException("Logic error: end of input while finishing excess row");
}
grabber.grabNext(byteSlice, lastInRow, endOfInput);
}
}

Expand All @@ -226,7 +222,7 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE
throw new CsvReaderException(message);
}

// Pad the row with a null vlaue literal appropriate for each column.
// Pad the row with a null value literal appropriate for each column.
while (colNum < numCols) {
final byte[] nvl = nullValueLiteralsAsUtf8[colNum];
if (nvl == null) {
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/io/deephaven/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,25 @@ public void bug133() throws CsvReaderException {
ignoringSpaces);
}

/**
* Reported in <a href="https://github.com/deephaven/deephaven-csv/issues/162">Deephaven CSV Issue #162</a>. The
* library was throwing an internal error when the last cell is empty and the last line is not terminated with a
* newline.
*/
@Test
public void bug162() throws CsvReaderException {
// Last cell is empty and the line is not terminated.
final String input = "A,B\n" +
"apple,banana\n" +
"cherry,";

final ColumnSet expected =
ColumnSet.of(
Column.ofRefs("A", "apple", "cherry"),
Column.ofRefs("B", "banana", null));
invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected);
}

@Test
public void validates() {
final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "
Expand Down