Skip to content

Commit

Permalink
Issue #162 - internal error when last cell is empty and last line is …
Browse files Browse the repository at this point in the history
…not terminated
  • Loading branch information
kosak committed Dec 2, 2023
1 parent 29847f3 commit fe676a6
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 47 deletions.
53 changes: 26 additions & 27 deletions src/main/java/io/deephaven/csv/reading/CellGrabber.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,37 +75,33 @@ public CellGrabber(
* trimming.
*
* @param dest The result, as a {@link ByteSlice}. The ByteSlice is invalidated by the next call to grabNext.
* @param lastInRow An out parameter whose contents are only specified if this method returns true. Its contents
* will be set to true if the cell just read was the last cell in the row, otherwise they will be set to
* false.
* @return true if a cell was read; false if at end of input.
* @param lastInRow An out parameter which will be set to true if the cell just read was the last cell in the row,
* otherwise it will be set to false.
* @param endOfInput An out parameter which will be set to true if the cell just read encountered the end of the
* input, otherwise it will be set to false.
*/
public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow)
throws CsvReaderException {
public void grabNext(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput) throws CsvReaderException {
spillBuffer.clear();
startOffset = offset;

if (ignoreSurroundingSpaces) {
skipWhitespace();
}
if (!tryEnsureMore()) {
return false;
}

// Is first char the quote char?
if (buffer[offset] == quoteChar) {
if (tryEnsureMore() && buffer[offset] == quoteChar) {
++offset;
processQuotedMode(dest, lastInRow);
processQuotedMode(dest, lastInRow, endOfInput);
if (trim) {
trimWhitespace(dest);
}
} else {
processUnquotedMode(dest, lastInRow);
processUnquotedMode(dest, lastInRow, endOfInput);
if (ignoreSurroundingSpaces) {
trimWhitespace(dest);
}
}
return true;
}

/**
Expand All @@ -114,8 +110,8 @@ public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow)
* @param lastInRow An out parameter. Its contents will be set to true if the cell just read was the last cell in
* the row, otherwise the contents will be set to false.
*/
private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow)
throws CsvReaderException {
private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput) throws CsvReaderException {
startOffset = offset;
boolean prevCharWasCarriageReturn = false;
while (true) {
Expand Down Expand Up @@ -162,7 +158,7 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn
// We got out of the quoted string. Consume any trailing matter after the quote and before the
// field
// delimiter. Hopefully that trailing matter is just whitespace, but we shall see.
finishField(dest, lastInRow);
finishField(dest, lastInRow, endOfInput);

// From this point on, note that dest is a slice that may point to the underlying input buffer
// or the spill buffer. Take care from this point on to not disturb the input (e.g. by reading
Expand All @@ -183,10 +179,10 @@ private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastIn
/**
* Process characters in "unquoted mode". This is easy: eat characters until the next field or line delimiter.
*/
private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow)
throws CsvReaderException {
private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput) throws CsvReaderException {
startOffset = offset;
finishField(dest, lastInRow);
finishField(dest, lastInRow, endOfInput);
}

/** Skip whitespace but do not consider the field delimiter to be whitespace. */
Expand All @@ -211,28 +207,30 @@ private void skipWhitespace() throws CsvReaderException {
* @param lastInRow An out parameter. Its contents are set to true if the cell was the last one in the row.
* Otherwise, its contents are set to false.
*/
private void finishField(final ByteSlice dest, final MutableBoolean lastInRow)
private void finishField(final ByteSlice dest, final MutableBoolean lastInRow,
final MutableBoolean endOfInput)
throws CsvReaderException {
while (true) {
if (offset == size) {
if (!tryEnsureMore()) {
finish(dest);
// End of file sets last in row.
lastInRow.setValue(true);
return;
}
if (!tryEnsureMore()) {
finish(dest);
// End of input sets both flags.
lastInRow.setValue(true);
endOfInput.setValue(true);
return;
}
final byte ch = buffer[offset];
if (ch == fieldDelimiter) {
finish(dest);
++offset; // ... and skip over the field delimiter.
lastInRow.setValue(false);
endOfInput.setValue(false);
return;
}
if (ch == '\n') {
finish(dest);
++offset;
lastInRow.setValue(true);
endOfInput.setValue(false);
++physicalRowNum;
return;
}
Expand All @@ -252,6 +250,7 @@ private void finishField(final ByteSlice dest, final MutableBoolean lastInRow)
finish(dest);
dest.reset(dest.data(), dest.begin(), dest.end() - excess);
lastInRow.setValue(true);
endOfInput.setValue(false);
++physicalRowNum;
return;
}
Expand Down
10 changes: 6 additions & 4 deletions src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] heade
}

/**
* Try to read one row from the input. Returns false if the input ends before one row has been read.
* Try to read one row from the input. Returns null if the input is empty
*
* @return The first row as a byte[][] or null if the input was exhausted.
*/
Expand All @@ -297,14 +297,16 @@ private static byte[][] tryReadOneRow(final CellGrabber grabber) throws CsvReade
// Grab the header
final ByteSlice slice = new ByteSlice();
final MutableBoolean lastInRow = new MutableBoolean();
final MutableBoolean endOfInput = new MutableBoolean();
do {
if (!grabber.grabNext(slice, lastInRow)) {
return null;
}
grabber.grabNext(slice, lastInRow, endOfInput);
final byte[] item = new byte[slice.size()];
slice.copyTo(item, 0);
headers.add(item);
} while (!lastInRow.booleanValue());
if (headers.size() == 1 && headers.get(0).length == 0 && endOfInput.booleanValue()) {
return null;
}
return headers.toArray(new byte[0][]);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ private static class RowAppender {
private final int numCols;
private final ByteSlice byteSlice;
private final MutableBoolean lastInRow;
private final MutableBoolean endOfInput;
private final byte[][] nullValueLiteralsAsUtf8;

public RowAppender(final String[] columnHeaders, final byte[][] optionalFirstDataRow, final CellGrabber grabber,
Expand All @@ -109,6 +110,7 @@ public RowAppender(final String[] columnHeaders, final byte[][] optionalFirstDat
}
byteSlice = new ByteSlice();
lastInRow = new MutableBoolean();
endOfInput = new MutableBoolean();
// Here we prepare ahead of time what we are going to do when we encounter a short row. Say the input looks
// like:
// 10,20,30,40,50
Expand Down Expand Up @@ -174,18 +176,15 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE
int colNum = 0;
for (colNum = 0; colNum < numCols; ++colNum) {
try {
if (!grabber.grabNext(byteSlice, lastInRow)) {
if (colNum == 0) {
// Input exhausted
return RowResult.END_OF_INPUT;
}
// Can't get here. If there is any data at all in the last row, and *then* the file
// ends, grabNext() will return true, with lastInRow set.
throw new RuntimeException("Logic error: uncaught short last row");
}
grabber.grabNext(byteSlice, lastInRow, endOfInput);
if (lastInRow.booleanValue()) {
if (byteSlice.size() == 0 && colNum == 0 && specs.ignoreEmptyLines()) {
return RowResult.IGNORED_EMPTY_ROW;
if (colNum == 0 && byteSlice.size() == 0) {
if (endOfInput.booleanValue()) {
return RowResult.END_OF_INPUT;
}
if (specs.ignoreEmptyLines()) {
return RowResult.IGNORED_EMPTY_ROW;
}
}
appendToDenseStorageWriter(dsws[colNum], byteSlice, writeToConsumer);
++colNum;
Expand All @@ -208,10 +207,7 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE
}
// Eat.
while (!lastInRow.booleanValue()) {
if (!grabber.grabNext(byteSlice, lastInRow)) {
// Can't happen. Won't get end of input while finishing excess row.
throw new RuntimeException("Logic error: end of input while finishing excess row");
}
grabber.grabNext(byteSlice, lastInRow, endOfInput);
}
}

Expand All @@ -226,7 +222,7 @@ public RowResult processNextRow(final boolean writeToConsumer) throws CsvReaderE
throw new CsvReaderException(message);
}

// Pad the row with a null vlaue literal appropriate for each column.
// Pad the row with a null value literal appropriate for each column.
while (colNum < numCols) {
final byte[] nvl = nullValueLiteralsAsUtf8[colNum];
if (nvl == null) {
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/io/deephaven/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,25 @@ public void bug133() throws CsvReaderException {
ignoringSpaces);
}

/**
* Reported in <a href="https://github.com/deephaven/deephaven-csv/issues/162">Deephaven CSV Issue #162</a>. The
* library was throwing an internal error when the last cell is empty and the last line is not terminated with a
* newline.
*/
@Test
public void bug162() throws CsvReaderException {
// Last cell is empty and the line is not terminated.
final String input = "A,B\n" +
"apple,banana\n" +
"cherry,";

final ColumnSet expected =
ColumnSet.of(
Column.ofRefs("A", "apple", "cherry"),
Column.ofRefs("B", "banana", null));
invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected);
}

@Test
public void validates() {
final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "
Expand Down

0 comments on commit fe676a6

Please sign in to comment.