diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java
index 1fb56527dc1..ea6a1763bbc 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -263,7 +263,7 @@ public Charset charset() {
     }

     /**
-     * Should the CSVReader run asynchronously for better performance.
+     * Should the CSVReader run its processing steps on multiple threads for better performance?
      * @return the async flag
      */
     @Default
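For orientation, a minimal usage sketch of the flag documented above. The `builder()` and `async(boolean)` accessors are assumptions inferred from the Immutables-style `@Default` annotation and the "async flag" wording; they are not confirmed by this diff.

```java
import io.deephaven.csv.CsvSpecs;

// Hypothetical usage, not part of this diff: builder() and async(boolean)
// are assumed names for the Immutables-style builder implied by @Default.
final class AsyncSpecsExample {
    static CsvSpecs multiThreadedSpecs() {
        return CsvSpecs.builder()
                .async(true) // run the reader's processing steps on multiple threads
                .build();
    }
}
```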
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java b/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java
index 6100c8563d7..178b092a2fc 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java
@@ -34,18 +34,6 @@ public byte back() {
         return data[end - 1];
     }

-    public void copyTo(byte[] dest, int destOffset) {
-        for (int cur = begin; cur != end; ++cur) {
-            dest[destOffset++] = data[cur];
-        }
-    }
-
-    public void copyTo(char[] dest, int destOffset) {
-        for (int cur = begin; cur != end; ++cur) {
-            dest[destOffset++] = (char)data[cur];
-        }
-    }
-
     public byte[] data() { return data; }
     public int begin() { return begin; }
     public int end() { return end; }
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java
index 1cfdfc6ae84..5271dbd52ec 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java
@@ -3,9 +3,10 @@
 import io.deephaven.csv.containers.ByteSlice;
 import io.deephaven.csv.containers.CharSlice;
+import io.deephaven.csv.util.CsvReaderException;

 /**
- * Companion to the DenseStorageWriter.
+ * Companion to the {@link DenseStorageWriter}. See the documentation there for details.
  */
 public final class DenseStorageReader {
     /**
@@ -33,11 +34,11 @@ public final class DenseStorageReader {
      */
     private final int[] intHolder = new int[1];

-    public DenseStorageReader(QueueReader.IntReader controlReader,
-            QueueReader.ByteReader byteReader,
-            QueueReader.CharReader charReader,
-            QueueReader.ByteArrayReader largeByteArrayReader,
-            QueueReader.CharArrayReader largeCharArrayReader) {
+    public DenseStorageReader(final QueueReader.IntReader controlReader,
+            final QueueReader.ByteReader byteReader,
+            final QueueReader.CharReader charReader,
+            final QueueReader.ByteArrayReader largeByteArrayReader,
+            final QueueReader.CharArrayReader largeCharArrayReader) {
         this.controlReader = controlReader;
         this.byteReader = byteReader;
         this.charReader = charReader;
@@ -45,7 +46,8 @@ public DenseStorageReader(QueueReader.IntReader controlReader,
         this.largeCharArrayReader = largeCharArrayReader;
     }

-    public boolean tryGetNextSlice(ByteSlice bs, CharSlice cs, boolean[] nextIsBytes) {
+    public boolean tryGetNextSlice(final ByteSlice bs, final CharSlice cs, final boolean[] nextIsBytes)
+            throws CsvReaderException {
         if (!controlReader.tryGetInt(intHolder)) {
             return false;
         }
@@ -70,10 +72,10 @@ public boolean tryGetNextSlice(ByteSlice bs, CharSlice cs, boolean[] nextIsBytes
         return true;
     }

-    private static void mustSucceed(boolean success, String what) {
+    private static void mustSucceed(final boolean success, final String what) throws CsvReaderException {
         if (success) {
             return;
         }
-        throw new RuntimeException("Data unexpectedly exhausted: " + what);
+        throw new CsvReaderException("Data unexpectedly exhausted: " + what);
     }
 }
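To make the reader's contract concrete, here is a minimal consumption sketch against `tryGetNextSlice` as changed above. Exactly one of the two slices holds the current "string", selected by `nextIsBytes[0]`; the no-arg `ByteSlice`/`CharSlice` constructors are an assumption for the sketch.

```java
import io.deephaven.csv.containers.ByteSlice;
import io.deephaven.csv.containers.CharSlice;
import io.deephaven.csv.densestorage.DenseStorageReader;
import io.deephaven.csv.util.CsvReaderException;

// Illustrative only: drains a DenseStorageReader via the tryGetNextSlice
// contract shown in the diff above.
final class DrainExample {
    static void drain(final DenseStorageReader reader) throws CsvReaderException {
        final ByteSlice bs = new ByteSlice();   // assumed no-arg constructor
        final CharSlice cs = new CharSlice();   // assumed no-arg constructor
        final boolean[] nextIsBytes = new boolean[1];
        while (reader.tryGetNextSlice(bs, cs, nextIsBytes)) {
            if (nextIsBytes[0]) {
                // Current "string" is ASCII-only; its bytes are in bs.
            } else {
                // Current "string" needs full chars; they are in cs.
            }
            // bs and cs are only views: the next call invalidates their contents.
        }
    }
}
```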
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java
index 4fdfc09e9ab..e5b34a3a650 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java
@@ -3,16 +3,82 @@
 import io.deephaven.csv.containers.CharSlice;

 /**
- * The point of this object is to store data with a small fraction of overhead. "Large" objects (byte or char sequences
- * with length >= a threshold) are stored directly. "Small" objects (byte or char sequences with a smaller length) are
- * compacted into byte and char pools.
+ * The DenseStorageWriter and {@link DenseStorageReader} work in tandem, forming a FIFO queue. The DenseStorageWriter
+ * writes data, and the {@link DenseStorageReader} reads that data. If the {@link DenseStorageReader} "catches up", it
+ * will block until the DenseStorageWriter provides more data, or indicates that it is done (via the {@link #finish()}
+ * method). This synchronization is done at "block" granularity, so the DenseStorageReader can only proceed when the
+ * DenseStorageWriter has written at least a "block" of data or is done. We allow multiple independent
+ * {@link DenseStorageReader}s to consume the same underlying data. In our implementation this is used so our
+ * type inferencer can take a second "pass" over the same input data.
+ *
+ * The point of this object is to store a sequence of character sequences ("strings", though not java.lang.String)
+ * with a small fraction of overhead. The problem with storing every character sequence as a java.lang.String is:
+ *
+ * 1. Per-object overhead (8 or 16 bytes)
+ * 2. The memory cost of holding a reference to that String (4 or 8 bytes)
+ * 3. The string has to know its length (4 bytes)
+ * 4. Java characters are 2 bytes, even though in practice many strings are ASCII-only and their chars can fit in a
+ *    byte
+ *
+ * For small strings (say the word "hello" or the input text "12345.6789") the overhead can be 100% or worse.
+ *
+ * For our purposes we:
+ *
+ * 1. Only need sequential access, i.e. we don't need random access into the sequence of "strings". So we can support
+ *    a model with a forward-only cursor moving over the sequence of "strings".
+ * 2. Don't need to give our caller a data structure that they can hold on to. The caller only gets a "view"
+ *    (a slice) of the current "string" data. The view is invalidated when they move to the next "string".
+ *
+ * Furthermore we:
+ *
+ * 1. Offer a FIFO model where the reader (in a separate thread) can chase the writer without an inordinate amount of
+ *    synchronization overhead.
+ * 2. Have the ability to make multiple Readers which pass over the same underlying data. This is our low-drama way of
+ *    allowing our client to make multiple passes over the data, without complicating the iteration interface with,
+ *    e.g., a reset method.
+ * 3. Use a linked-list structure so that when all existing readers have moved past a block of data, that block can be
+ *    freed by the garbage collector.
+ *
+ * If you are familiar with the structure of our inference, you may initially think that this reader-chasing-writer
+ * garbage collection trick doesn't buy us much because we have a two-phase parser. However, when the inferencer has
+ * gotten to the last parser in its set of allowable parsers (say, the String parser), or the user has specified
+ * that there is only one parser for this column, then the code doesn't need to do any inference and can parse the
+ * column in one pass. (TODO(kosak): one-pass not implemented yet. Coming shortly.)
+ *
+ * The implementation used here is to look at the "string" being added to the writer and categorize it along two
+ * dimensions: whether it is small or large, and whether its characters fit in bytes or need full chars.
+ *
+ * These dimensions are broken out in the following way:
+ *
+ * - Small byte "strings" are packed into a byte block, and we maintain a linked list of these byte blocks.
+ * - Small char "strings" are packed into a char block, and we maintain a linked list of these char blocks.
+ * - "Large" objects (byte or char sequences with length >= a threshold) are stored directly, meaning a byte[] or
+ *   char[] array is allocated for their data, then a reference to that array is added to a byte-array or char-array
+ *   block. (And again, we maintain a linked list of these byte-array or char-array blocks.) It is not typical for CSV
+ *   data to contain a cell this large, but the feature is there for completeness. We do not want large "strings" to
+ *   contaminate our byte and char blocks because they would not likely pack into them tightly. It's OK to keep them
+ *   on their own because, by definition, large "strings" are not going to have much overhead as a percentage of
+ *   their size.
  */
 public final class DenseStorageWriter {
     /**
-     * The ints in this array indicate where the next item is stored: Integer.MIN_VALUE: largeStringWriter
-     * Integer.MAX_VALUE: largeByteWriter == 0: no bytes or characters, so they're not stored anywhere otherwise < 0:
-     * charWriter (the number of chars is the negative of this value) otherwise >= 0 : byteWriter (the number of bytes
-     * is equal to this value)
+     * The ints in this array indicate where the next item is stored:
+     *
+     * - Integer.MIN_VALUE: the large char-array writer
+     * - Integer.MAX_VALUE: the large byte-array writer
+     * - == 0: no bytes or characters, so they're not stored anywhere
+     * - < 0: the char writer (the number of chars is the negative of this value)
+     * - >= 0: the byte writer (the number of bytes is equal to this value)
      */
     private final QueueWriter.IntWriter controlWriter;
     /**
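A hedged sketch of how a consumer interprets each control int under the encoding documented above; the concrete reads happen in DenseStorageReader.tryGetNextSlice, and the comments name the queues it consults.

```java
// Illustrative only: dispatch on one control int, following the encoding in
// the controlWriter Javadoc above. The real logic lives in
// DenseStorageReader.tryGetNextSlice.
final class ControlIntSketch {
    static void dispatch(final int control) {
        if (control == Integer.MIN_VALUE) {
            // Large char "string": a char[] was stored directly; read its
            // reference from the large char-array queue.
        } else if (control == Integer.MAX_VALUE) {
            // Large byte "string": a byte[] was stored directly; read its
            // reference from the large byte-array queue.
        } else if (control < 0) {
            // Small char "string": -control chars were packed into the char block queue.
        } else {
            // Small byte "string" (possibly zero length): control bytes were
            // packed into the byte block queue.
        }
    }
}
```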
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java
index b231cb7ca07..ad6436c6c67 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java
@@ -1,10 +1,18 @@
 package io.deephaven.csv.densestorage;

+/**
+ * Linked list node that holds data for a {@link DenseStorageWriter} or {@link DenseStorageReader}.
+ * All fields are immutable except the "next" field. Synchronization for reading/writing the "next" field
+ * is managed by the {@link DenseStorageWriter} and {@link DenseStorageReader}.
+ */
 public final class QueueNode {
     public final TARRAY data;
     public final int begin;
     public final int end;
     public final boolean isLast;
+    /**
+     * Readers and writers of this field have arranged to synchronize with each other.
+     */
     public QueueNode next;

     public QueueNode(TARRAY data, int begin, int end, boolean isLast) {
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java
index d40c1941eb6..0f4c1b8b112 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java
@@ -3,6 +3,9 @@
 import io.deephaven.csv.containers.ByteSlice;
 import io.deephaven.csv.containers.CharSlice;

+/**
+ * Companion to the {@link QueueWriter}. See the documentation there for details.
+ */
 public class QueueReader {
     private final Object sync;
     private QueueNode node;
@@ -105,6 +108,9 @@ public boolean tryGetChars(int size, CharSlice cs) {
         }
     }

+    /**
+     * A QueueReader specialized for bytes.
+     */
     public static final class ByteReader extends QueueReader {
         private byte[] typedBlock;

@@ -131,6 +137,9 @@ public boolean tryGetBytes(int size, ByteSlice bs) {
         }
     }

+    /**
+     * A QueueReader specialized for ints.
+     */
     public static final class IntReader extends QueueReader {
         private int[] typedBlock;

@@ -157,6 +166,9 @@ public boolean tryGetInt(int[] result) {
         }
     }

+    /**
+     * A QueueReader specialized for byte arrays.
+     */
     public static final class ByteArrayReader extends QueueReader {
         private byte[][] typedBlock;

@@ -183,6 +195,9 @@ public boolean tryGetBytes(ByteSlice bs) {
         }
     }

+    /**
+     * A QueueReader specialized for char arrays.
+     */
     public static final class CharArrayReader extends QueueReader {
         private char[][] typedBlock;
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java
index 8b86c668c89..7885109537c 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java
@@ -5,6 +5,22 @@
 import java.util.function.BiFunction;
 import java.util.function.IntFunction;

+/**
+ * The various QueueWriters ({@link ByteWriter}, {@link CharWriter}, etc.) work in tandem with their corresponding
+ * {@link QueueReader}s ({@link QueueReader.ByteReader}, {@link QueueReader.CharReader}, etc.), forming a FIFO queue.
+ * The QueueWriter writes data, and the {@link QueueReader} reads that data. If the {@link QueueReader} "catches up",
+ * it will block until the QueueWriter provides more data, or indicates that it is done (via the {@link #finish()}
+ * method). This synchronization is done at "block" granularity, so the {@link QueueReader} can only proceed when the
+ * QueueWriter has written at least a "block" of data or is done. We allow multiple independent {@link QueueReader}s
+ * to consume the same underlying data. In our implementation this is used so our type inferencer can take a second
+ * "pass" over the same input data.
+ *
+ * In our implementation the {@link DenseStorageWriter} and {@link DenseStorageReader} are built out of various
+ * QueueWriters and {@link QueueReader}s. This explains why the semantics of {@link DenseStorageWriter} and
+ * {@link DenseStorageReader} are similar to those of the underlying QueueWriters and {@link QueueReader}s.
+ */
 public class QueueWriter {
     private final Object sync;
     private QueueNode tail;
@@ -17,9 +33,9 @@ public class QueueWriter {
     protected int current;
     protected int end;

-    protected QueueWriter(int blobSize,
-            IntFunction arrayFactory,
-            BiFunction, TREADER> readerFactory) {
+    protected QueueWriter(final int blobSize,
+            final IntFunction arrayFactory,
+            final BiFunction, TREADER> readerFactory) {
         this.sync = new Object();
         // Placeholder object at head of linked list
         this.tail = new QueueNode<>(null, 0, 0, false);
@@ -33,6 +49,9 @@ protected QueueWriter(int blobSize,
         this.end = 0;
     }

+    /**
+     * Caller is finished writing.
+     */
     public void finish() {
         flush(true);
         genericBlock = null; // hygiene
@@ -41,6 +60,10 @@ public void finish() {
         end = 0;
     }

+    /**
+     * Make a {@link QueueReader} corresponding to this QueueWriter. You can make as many {@link QueueReader}s as you
+     * want, but you should make them before you start writing data.
+     */
     public TREADER newReader() {
         if (!allowReaderCreation) {
             throw new RuntimeException("Must allocate readers before writing any data");
@@ -49,8 +72,8 @@ public TREADER newReader() {
     }

     /**
-     * This supports an "early flush" for callers like DenseStorageWriter who want to flush all their queues from time
-     * to time.
+     * This supports an "early flush" for callers like {@link DenseStorageWriter} who want to flush all their queues
+     * from time to time.
      */
     public void flush() {
         flush(false);
@@ -64,8 +87,8 @@ public void flush() {
      */
     private void flush(boolean isLast) {
         // Sometimes our users ask us to flush even if there is nothing to flush.
-        // We need to flush "isLast" blocks (whether or not they contain data) and
-        // we need to flush blocks containing data. We don't need to flush empty blocks.
+ // If the block is an "isLast" block, we need to flush it regardless of whether it contains data. + // Otherwise (if the block is not an "isLast" block), we only flush it if it contains data. if (!isLast && (begin == end)) { // No need to flush. return; @@ -93,6 +116,9 @@ protected final TARRAY flushAndAllocate(int additional) { return genericBlock; } + /** + * A QueueWriter specialized for chars. + */ public static final class CharWriter extends QueueWriter { private char[] block = null; @@ -115,6 +141,9 @@ public boolean addChars(CharSlice cs) { } } + /** + * A QueueWriter specialized for bytes. + */ public static final class ByteWriter extends QueueWriter { private byte[] block = null; @@ -140,6 +169,9 @@ public boolean addBytesFromCharSlice(CharSlice cs) { } } + /** + * A QueueWriter specialized for ints. + */ public static final class IntWriter extends QueueWriter { private int[] block = null; @@ -160,7 +192,9 @@ public boolean addInt(int value) { } } - + /** + * A QueueWriter specialized for byte arrays. + */ public static final class ByteArrayWriter extends QueueWriter { private byte[][] block = null; @@ -181,6 +215,9 @@ public boolean addByteArray(byte[] value) { } } + /** + * A QueueWriter specialized for char arrays. + */ public static final class CharArrayWriter extends QueueWriter { private char[][] block = null; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java index 3a47b4506ad..37a73db73e3 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java @@ -3,6 +3,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public final class BooleanParser extends ParserBase { public static BooleanParser INSTANCE = new BooleanParser(); @@ -10,13 +11,14 @@ public final class BooleanParser extends ParserBase { private BooleanParser() {} @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, BooleanParser::tryParseHelper, ctx.sinkFactory::makeBooleanAsByteSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) + throws CsvReaderException { final byte[] chunk = new byte[DEST_BLOCK_SIZE]; final boolean[] boolHolder = new boolean[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java index d00c8dccfc8..a228e617ceb 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java @@ -4,6 +4,7 @@ import io.deephaven.csv.tokenization.RangeTests; import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public final class ByteParser extends ParserBase { public static final ByteParser INSTANCE = new ByteParser(); @@ -11,12 +12,12 @@ public final class ByteParser extends ParserBase { private ByteParser() {} @Override - public Sink 
tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, ByteParser::tryParseHelper, ctx.sinkFactory::makeByteSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) throws CsvReaderException { final byte[] chunk = new byte[DEST_BLOCK_SIZE]; final long[] longHolder = new long[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java index d73b5bad24b..ea10ad5355c 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java @@ -2,6 +2,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public final class CharParser extends ParserBase { public static final CharParser INSTANCE = new CharParser(); @@ -10,14 +11,14 @@ private CharParser() { } @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, CharParser::tryParseHelper, ctx.sinkFactory::makeCharSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, Sink sink, - long current, long end) { + long current, long end) throws CsvReaderException { if (!ctx.isNullOrWidthOneSoFar) { return false; } diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java index 24bc8a798fc..d25131d0176 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java @@ -3,6 +3,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public final class DateTimeAsLongParser extends ParserBase { public static final DateTimeAsLongParser INSTANCE = new DateTimeAsLongParser(); @@ -10,13 +11,13 @@ public final class DateTimeAsLongParser extends ParserBase { private DateTimeAsLongParser() {} @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, DateTimeAsLongParser::tryParseHelper, ctx.sinkFactory::makeDateTimeAsLongSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) throws CsvReaderException { final long[] chunk = new long[DEST_BLOCK_SIZE]; final long[] dateTimeAsLongHolder = new long[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java index 151175b2669..6d1432e1bd5 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java +++ 
b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java @@ -3,6 +3,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public final class DoubleParser extends ParserBase { public static final DoubleParser INSTANCE = new DoubleParser(); @@ -10,12 +11,12 @@ public final class DoubleParser extends ParserBase { private DoubleParser() {} @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, DoubleParser::tryParseHelper, ctx.sinkFactory::makeDoubleSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, Sink sink, - long current, long end) { + long current, long end) throws CsvReaderException { final double[] chunk = new double[DEST_BLOCK_SIZE]; final double[] doubleHolder = new double[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java index 54d8efc857a..4eef2adc1ae 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java @@ -4,6 +4,7 @@ import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.util.CsvReaderException; public final class FloatParser extends ParserBase { public static final FloatParser INSTANCE = new FloatParser(); @@ -11,12 +12,12 @@ public final class FloatParser extends ParserBase { private FloatParser() {} @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, FloatParser::tryParseFloatsHelper, ctx.sinkFactory::makeFloatSink); } private static boolean tryParseFloatsHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) throws CsvReaderException { if (!ctx.hasFewerThan8SigFigsSoFar) { return false; } diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java index 5903f8efdbf..e8915ffde20 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java @@ -4,6 +4,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.tokenization.RangeTests; import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; public final class IntParser extends ParserBase { public static final IntParser INSTANCE = new IntParser(); @@ -11,13 +12,13 @@ public final class IntParser extends ParserBase { private IntParser() {} @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, IntParser::tryParseHelper, ctx.sinkFactory::makeIntSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, Sink 
sink, - long current, long end) { + long current, long end) throws CsvReaderException { final int[] chunk = new int[DEST_BLOCK_SIZE]; final long[] longHolder = new long[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java index 74f84bfd541..153ebcf04d7 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java @@ -3,6 +3,7 @@ import io.deephaven.csv.densestorage.DenseStorageReader; import io.deephaven.csv.containers.ByteSlice; import io.deephaven.csv.containers.CharSlice; +import io.deephaven.csv.util.CsvReaderException; public final class IteratorHolder { private final DenseStorageReader dsr; @@ -38,7 +39,7 @@ public IteratorHolder(DenseStorageReader dsr) { this.dsr = dsr; } - public boolean tryMoveNext() { + public boolean tryMoveNext() throws CsvReaderException { if (!dsr.tryGetNextSlice(bs, cs, booleanHolder)) { isExhausted = true; return false; @@ -48,11 +49,11 @@ public boolean tryMoveNext() { return true; } - public void mustMoveNext() { + public void mustMoveNext() throws CsvReaderException { if (tryMoveNext()) { return; } - throw new RuntimeException("Iteration ended unexpectedly."); + throw new CsvReaderException("Iteration ended unexpectedly."); } public String sliceToString() { diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java index cca2b2d03c6..f7c1d3ce811 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java @@ -3,6 +3,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public final class LongParser extends ParserBase { public static final LongParser INSTANCE = new LongParser(); @@ -10,12 +11,12 @@ public final class LongParser extends ParserBase { private LongParser() { } - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, LongParser::tryParseHelper, ctx.sinkFactory::makeLongSink); } private static boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) throws CsvReaderException { final long[] chunk = new long[DEST_BLOCK_SIZE]; final long[] longHolder = new long[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ParserBase.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ParserBase.java index f7926e0a402..4af6bd4b6f7 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ParserBase.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ParserBase.java @@ -2,16 +2,18 @@ import io.deephaven.csv.parsers.context.ParseContext; import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.util.CsvReaderException; import java.util.function.Supplier; public abstract class ParserBase { protected static final int DEST_BLOCK_SIZE = 65536; - public abstract Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt); + public abstract 
Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException;

     protected Sink twoPhaseDriver(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt,
-            ParseHelperInvoker parseHelperInvoker, Supplier> sinkFactory) {
+            ParseHelperInvoker parseHelperInvoker,
+            Supplier> sinkFactory) throws CsvReaderException {
         if (ih.isExhausted()) {
             // Column contains all nulls (or is empty)
             final Sink sink = sinkFactory.get();
@@ -36,23 +38,21 @@ protected Sink twoPhaseDriver(ParseContext ctx, IteratorHolder ih, Itera
         ihAlt.mustMoveNext();
         if (!parseHelperInvoker.apply(ctx, ihAlt, lazySink, 0, startRow)) {
-            // TODO(kosak): better error here
-            throw new RuntimeException("Parse failed on rescan");
+            throw new CsvReaderException("Parse unexpectedly failed on second pass through the input.");
         }
         return lazySink.inner;
     }

-    protected static T assertHasNullValue(T boxedValue) {
+    protected static T assertHasNullValue(T boxedValue) throws CsvReaderException {
         if (boxedValue != null) {
             return boxedValue;
         }
-        // TODO(kosak): better exception.
-        throw new RuntimeException("Encountered a null cell but no null value was configured");
+        throw new CsvReaderException("Encountered a null cell but no null value was configured");
     }

     protected interface ParseHelperInvoker {
         boolean apply(ParseContext ctx, IteratorHolder ih, Sink workingSink,
-                long current, long end);
+                long current, long end) throws CsvReaderException;
     }

     private static final class Lazy implements Sink {
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java
index c98f7a8527b..d0ddce5c348 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java
@@ -4,6 +4,7 @@
 import io.deephaven.csv.tokenization.RangeTests;
 import io.deephaven.csv.tokenization.Tokenizer;
 import io.deephaven.csv.parsers.context.ParseContext;
+import io.deephaven.csv.util.CsvReaderException;

 public final class ShortParser extends ParserBase {
     public static final ShortParser INSTANCE = new ShortParser();
@@ -12,12 +13,12 @@ private ShortParser() {
     }

     @Override
-    public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) {
+    public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException {
         return twoPhaseDriver(ctx, ih, ihAlt, ShortParser::tryParseHelper, ctx.sinkFactory::makeShortSink);
     }

     private static boolean tryParseHelper(
-            ParseContext ctx, IteratorHolder ih, Sink sink, long current, long end) {
+            ParseContext ctx, IteratorHolder ih, Sink sink, long current, long end) throws CsvReaderException {
         final short[] chunk = new short[DEST_BLOCK_SIZE];
         final long[] longHolder = new long[1];
         final Tokenizer t = ctx.tokenizer;
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java
index 213ac00af18..b3f8d857f1c 100644
--- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java
+++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java
@@ -2,6 +2,7 @@
 import io.deephaven.csv.sinks.Sink;
 import io.deephaven.csv.parsers.context.ParseContext;
+import io.deephaven.csv.util.CsvReaderException;

 public final class StringParser extends ParserBase {
     public static final StringParser INSTANCE = new StringParser();
@@ -9,12 +10,12 @@ public final class StringParser extends
ParserBase { private StringParser() {} @Override - public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, StringParser::tryParseStringsHelper, ctx.sinkFactory::makeStringSink); } private static boolean tryParseStringsHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) throws CsvReaderException { final String[] chunk = new String[DEST_BLOCK_SIZE]; final String[] nullValue = ctx.sentinelConfiguration.nullStringValue; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java index 258c0ec9031..b53eb6ecd44 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java @@ -3,6 +3,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; public abstract class TimestampParserBase extends ParserBase { protected static final long SECOND_SCALE = 1_000_000_000; @@ -24,12 +25,12 @@ protected TimestampParserBase(long scale) { } @Override - public final Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) { + public final Sink tryParse(ParseContext ctx, IteratorHolder ih, IteratorHolder ihAlt) throws CsvReaderException { return twoPhaseDriver(ctx, ih, ihAlt, this::tryParseHelper, ctx.sinkFactory::makeDateTimeAsLongSink); } private boolean tryParseHelper(ParseContext ctx, IteratorHolder ih, - Sink sink, long current, long end) { + Sink sink, long current, long end) throws CsvReaderException { final long[] chunk = new long[DEST_BLOCK_SIZE]; final long[] longHolder = new long[1]; final Tokenizer t = ctx.tokenizer; diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java index 5d462bfc6bc..ab059b6166c 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java @@ -8,6 +8,7 @@ import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.sinks.SinkFactory; import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; import io.deephaven.csv.util.Renderer; import io.deephaven.csv.parsers.context.ParseContext; @@ -17,7 +18,7 @@ public final class ParseDenseStorageToColumn { public static Sink doit(DenseStorageReader dsr, DenseStorageReader dsrAlt, Set> parsers, ParserBase nullParser, - SentinelConfiguration nullContext, SinkFactory sinkFactory) { + SentinelConfiguration nullContext, SinkFactory sinkFactory) throws CsvReaderException { final ParseDenseStorageToColumn pdsc = new ParseDenseStorageToColumn(dsr, dsrAlt, parsers, nullParser, nullContext, sinkFactory); return pdsc.run(); @@ -60,7 +61,7 @@ private ParseDenseStorageToColumn( this.sinkFactory = sinkFactory; } - private Sink run() { + private Sink run() throws CsvReaderException { List> parsersToTry; if (parsers == null) { parsersToTry = Parsers.DEFAULT; @@ -90,14 +91,14 @@ private Sink run() { if (columnIsAllNulls && parsersToTry.size() != 1) { if 
(nullParser == null) { - throw new RuntimeException( + throw new CsvReaderException( "Column contains all null cells: can't infer type of column, and nullParser is not set."); } parsersToTry = List.of(nullParser); } if (parsersToTry.size() == 0) { - throw new RuntimeException("No parsers available to try."); + throw new CsvReaderException("No parsers available to try."); } final IteratorHolder ihAlt = new IteratorHolder(dsrAlt); @@ -108,7 +109,7 @@ private Sink run() { } } - throw new RuntimeException(String.format("Tried %d parsers, none succeeded. Parsers were: %s", + throw new CsvReaderException(String.format("Tried %d parsers, none succeeded. Parsers were: %s", parsersToTry.size(), Renderer.renderList(parsersToTry, ", ", p -> p.getClass().getName()))); }
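Taken together, these hunks thread CsvReaderException through the whole inference pipeline. For readers new to the flow, here is a self-contained sketch of the "try each parser in order" loop that run() implements per the hunks above; ColumnParser, inferColumn, and the plain Exception are illustrative stand-ins, not types or names from this codebase.

```java
import java.util.List;

// Hedged sketch of the inference strategy in ParseDenseStorageToColumn.run():
// try each candidate parser in order; the first one that accepts every cell
// provides the column's sink, otherwise the whole read fails.
interface ColumnParser {
    // Returns null when this parser rejects the column (mirrors tryParse
    // returning no sink); may re-read earlier rows on a second pass.
    Object tryParse() throws Exception;
}

final class InferenceSketch {
    static Object inferColumn(final List<ColumnParser> parsersToTry) throws Exception {
        if (parsersToTry.isEmpty()) {
            throw new Exception("No parsers available to try.");
        }
        for (final ColumnParser parser : parsersToTry) {
            final Object result = parser.tryParse();
            if (result != null) {
                return result; // first parser that accepts every cell wins
            }
        }
        throw new Exception("Tried " + parsersToTry.size() + " parsers, none succeeded.");
    }
}
```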