diff --git a/ClientSupport/src/test/java/io/deephaven/treetable/SnapshotStateTest.java b/ClientSupport/src/test/java/io/deephaven/treetable/SnapshotStateTest.java index 7d24dd9e55b..9c288815770 100644 --- a/ClientSupport/src/test/java/io/deephaven/treetable/SnapshotStateTest.java +++ b/ClientSupport/src/test/java/io/deephaven/treetable/SnapshotStateTest.java @@ -1,6 +1,7 @@ package io.deephaven.treetable; import io.deephaven.csv.CsvTools; +import io.deephaven.csv.util.CsvReaderException; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.lang.QueryLibrary; import io.deephaven.engine.util.TableTools; @@ -9,7 +10,6 @@ import io.deephaven.engine.table.impl.QueryTableTestBase; import org.junit.Test; -import java.io.IOException; import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; @@ -18,7 +18,7 @@ import static io.deephaven.treetable.TreeTableConstants.ROOT_TABLE_KEY; public class SnapshotStateTest extends QueryTableTestBase { - private static Table getRawNyMunis() throws IOException { + private static Table getRawNyMunis() throws CsvReaderException { QueryLibrary.importStatic(TreeSnapshotQueryTest.StaticHolder.class); final BaseTable base = @@ -33,12 +33,12 @@ private static Table makeNyMunisTreeTableFrom(Table t) { return t.treeTable("Path", "Direct"); } - private static Table makeNyMunisTreeTable() throws IOException { + private static Table makeNyMunisTreeTable() throws CsvReaderException { return makeNyMunisTreeTableFrom(getRawNyMunis()); } @Test - public void testBounds() throws IOException { + public void testBounds() throws CsvReaderException { final HierarchicalTable treeTable = (HierarchicalTable) makeNyMunisTreeTable(); final Map details = new HashMap<>(); diff --git a/ClientSupport/src/test/java/io/deephaven/treetable/TreeSnapshotQueryTest.java b/ClientSupport/src/test/java/io/deephaven/treetable/TreeSnapshotQueryTest.java index d375b2dd177..84ad346f1af 100644 --- a/ClientSupport/src/test/java/io/deephaven/treetable/TreeSnapshotQueryTest.java +++ b/ClientSupport/src/test/java/io/deephaven/treetable/TreeSnapshotQueryTest.java @@ -2,6 +2,7 @@ import io.deephaven.base.Pair; import io.deephaven.csv.CsvTools; +import io.deephaven.csv.util.CsvReaderException; import io.deephaven.datastructures.util.SmartKey; import io.deephaven.engine.table.ColumnDefinition; import io.deephaven.engine.table.Table; @@ -21,7 +22,6 @@ import gnu.trove.map.hash.TIntObjectHashMap; import gnu.trove.map.hash.TObjectIntHashMap; -import java.io.IOException; import java.util.*; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -183,7 +183,7 @@ public static List removeEmpty(String... components) { } } - private static Table getRawNyMunis() throws IOException { + private static Table getRawNyMunis() throws CsvReaderException { QueryLibrary.importStatic(StaticHolder.class); final BaseTable base = @@ -202,7 +202,7 @@ private static Table getRawNyMunis() throws IOException { .lastBy("Path"); } - private static Table makeNyMunisTreeTable() throws IOException { + private static Table makeNyMunisTreeTable() throws CsvReaderException { return makeNyMunisTreeTableFrom(getRawNyMunis()); } @@ -218,7 +218,7 @@ private static List munisKey(String... 
path) { // region Actual Tests - public void testTsq() throws IOException { + public void testTsq() throws CsvReaderException { final Table t = makeNyMunisTreeTable(); final TTState state = new TTState(t); final BitSet allColumns = new BitSet(t.getColumns().length); @@ -456,7 +456,7 @@ public void testTsq() throws IOException { assertFalse(state.expansionMap.containsKey(mayfieldKey)); } - public void testSortandFilter() throws IOException { + public void testSortandFilter() throws CsvReaderException { final Table t = makeNyMunisTreeTable(); final TTState state = new TTState(t); final BitSet allColumns = new BitSet(t.getColumns().length); diff --git a/Integrations/python/deephaven/csv.py b/Integrations/python/deephaven/csv.py index 6fb308883c3..153d8b52bd8 100644 --- a/Integrations/python/deephaven/csv.py +++ b/Integrations/python/deephaven/csv.py @@ -116,7 +116,7 @@ def read(path: str, Args: path (str): a file path or a URL string header (Dict[str, DataType]): a dict to define the table columns with key being the name, value being the data type - inference (csv.Inference): an Enum value specifying the rules for data type inference, default is INFERENCE_STANDARD_TIMES + inference (csv.Inference): an Enum value specifying the rules for data type inference, default is INFERENCE_STANDARD headless (bool): indicates if the CSV data is headless, default is False delimiter (str): the delimiter used by the CSV, default is the comma quote (str): the quote character for the CSV, default is double quote @@ -133,7 +133,7 @@ def read(path: str, """ if inference is None: - inference = INFERENCE_STANDARD_TIMES + inference = INFERENCE_STANDARD csv_specs_builder = _JCsvSpecs.builder() diff --git a/Integrations/python/test/data/test_csv.csv b/Integrations/python/test/data/test_csv.csv index c0fb59936d4..220e535b79a 100644 --- a/Integrations/python/test/data/test_csv.csv +++ b/Integrations/python/test/data/test_csv.csv @@ -3,6 +3,6 @@ String,Long,Float a ,9223372036854775807, b,,-Infinity c,-9223372036854775807,NaN -"d 'c " ,9999999,3.4028235e+38f +"d 'c " ,9999999,3.4028234e+38f null,-0,1.17549435E-38f "null",0,1.4e-45f diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/InMemoryTable.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/InMemoryTable.java index 008ebd9bef8..898c977ae82 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/InMemoryTable.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/InMemoryTable.java @@ -39,6 +39,11 @@ public static InMemoryTable from(NewTable table) { columns); } + public static InMemoryTable from(TableDefinition definition, TrackingRowSet rowSet, + Map> columns) { + return new InMemoryTable(definition, rowSet, columns); + } + public InMemoryTable(String[] columnNames, Object[] arrayValues) { super(RowSetFactory.flat(Array.getLength(arrayValues[0])).toTracking(), createColumnsMap(columnNames, arrayValues)); diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java index bb21dbf24b9..bacb9f7a486 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java @@ -1,34 +1,45 @@ package io.deephaven.csv; +import gnu.trove.map.hash.TIntObjectHashMap; import io.deephaven.annotations.BuildableStyle; -import io.deephaven.api.util.NameValidator; -import io.deephaven.qst.array.Array; -import io.deephaven.qst.array.ArrayBuilder; +import io.deephaven.chunk.*; +import 
io.deephaven.chunk.attributes.Values; +import io.deephaven.csv.containers.ByteSlice; +import io.deephaven.csv.parsers.Parser; +import io.deephaven.csv.parsers.Parsers; +import io.deephaven.csv.reading.CsvReader; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.sinks.SinkFactory; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSequenceFactory; +import io.deephaven.engine.rowset.RowSetFactory; +import io.deephaven.engine.rowset.TrackingRowSet; +import io.deephaven.engine.table.*; +import io.deephaven.engine.table.impl.InMemoryTable; +import io.deephaven.engine.table.impl.sources.*; +import io.deephaven.qst.column.header.ColumnHeader; import io.deephaven.qst.table.NewTable; import io.deephaven.qst.table.TableHeader; -import io.deephaven.qst.type.Type; -import org.apache.commons.csv.CSVFormat; -import org.apache.commons.csv.CSVParser; -import org.apache.commons.csv.CSVRecord; +import io.deephaven.qst.type.*; +import io.deephaven.time.DateTime; +import io.deephaven.time.TimeZone; +import io.deephaven.util.BooleanUtils; +import io.deephaven.util.QueryConstants; +import org.apache.commons.lang3.mutable.MutableLong; +import org.apache.commons.lang3.mutable.MutableObject; import org.immutables.value.Value.Default; import org.immutables.value.Value.Immutable; -import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.IntStream; +import java.time.ZoneId; +import java.util.*; /** * A specification object for parsing a CSV, or CSV-like, structure into a {@link NewTable}. @@ -38,10 +49,17 @@ public abstract class CsvSpecs { public interface Builder { - Builder header(TableHeader header); - Builder putParsers(String columnName, Parser parser); + Builder putParserForName(String columnName, Parser parser); + + Builder putParserForIndex(int index, Parser parser); + + Builder nullValueLiteral(String nullValueLiteral); + + Builder putNullValueLiteralForName(String columnName, String nullValueLiteral); + + Builder putNullValueLiteralForIndex(int index, String nullValueLiteral); Builder inference(InferenceSpecs inferenceSpecs); @@ -57,6 +75,8 @@ public interface Builder { Builder charset(Charset charset); + Builder concurrent(boolean async); + CsvSpecs build(); } @@ -137,30 +157,63 @@ public static CsvSpecs fromLegacyFormat(String format) { * A header, when specified, hints at the parser to use. * *

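For orientation, the builder surface in this hunk keys parsers and null-value literals by column name or 1-based column index, replacing the old single parsers() map. A minimal usage sketch against the methods shown here; the file path and column names are hypothetical, and the builder is assumed to be the immutables-generated one implied by this interface:

```java
import io.deephaven.csv.CsvSpecs;
import io.deephaven.csv.CsvTools;
import io.deephaven.csv.parsers.Parsers;
import io.deephaven.csv.util.CsvReaderException;
import io.deephaven.engine.table.Table;

public class ReadExample {
    public static void main(String[] args) throws CsvReaderException {
        final CsvSpecs specs = CsvSpecs.builder()
                .putParserForName("Volume", Parsers.LONG) // pin a parser; forgoes inference for this column
                .putNullValueLiteralForIndex(3, "N/A")    // 1-based index: column 3 treats "N/A" as null
                .concurrent(true)                         // multi-threaded read, the new default
                .build();
        final Table table = CsvTools.readCsv("/data/quotes.csv", specs); // hypothetical path
        System.out.println(table.size());
    }
}
```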
- * To be even more explicit, callers may also use {@link #parsers()}. + * To be even more explicit, callers may also use {@link #parserForName()} or {@link #parserForIndex()}. * - * @return the table header + * @return the table header. */ public abstract Optional header(); /** * The parsers, where the keys are column names. Specifying a parser for a column forgoes inference for that column. * - * @return the parsers + * @return the parsers. + */ + public abstract Map> parserForName(); + + /** + * The parsers, where the keys are 1-based column indices. Specifying a parser for a column forgoes inference for + * that column. + * + * @return the parsers. + */ + public abstract Map> parserForIndex(); + + + /** + * The null value literal that is used when it is not overridden for any particular column. + */ + @Default + public String nullValueLiteral() { + return ""; + } + + /** + * The null value literals, where the keys are column names. Specifying a null value literal for a column overrides + * the default null value literal, which is the empty string. + * + * @return the null value literals + */ + public abstract Map nullValueLiteralForName(); + + /** + * The null value literals, where the keys are 1-based column indices. Specifying a null value literal for a column + * overrides the default null value literal, which is the empty string. + * + * @return the null value literals */ - public abstract Map> parsers(); + public abstract Map nullValueLiteralForIndex(); /** * The inference specifications. * *

- * By default, is {@link InferenceSpecs#standardTimes()}. + * By default, is {@link InferenceSpecs#standard()}. * * @return the inference specifications */ @Default public InferenceSpecs inference() { - return InferenceSpecs.standardTimes(); + return InferenceSpecs.standard(); } /** @@ -247,12 +300,14 @@ public Charset charset() { return StandardCharsets.UTF_8; } - private CSVFormat format() { - return CSVFormat.DEFAULT - .withIgnoreSurroundingSpaces(ignoreSurroundingSpaces()) - .withDelimiter(delimiter()) - .withQuote(quote()) - .withTrim(trim()); + /** + * Should the CSVReader run its processing steps concurrently on multiple threads for better performance. + * + * @return the concurrent flag + */ + @Default + public boolean concurrent() { + return true; } /** @@ -264,9 +319,9 @@ private CSVFormat format() { * * @param stream the stream * @return the new table - * @throws IOException if an I/O exception occurs + * @throws CsvReaderException if any sort of failure occurs. */ - public final NewTable parse(InputStream stream) throws IOException { + public final Table parse(InputStream stream) throws CsvReaderException { return parse(new InputStreamReader(stream, charset())); } @@ -279,132 +334,418 @@ public final NewTable parse(InputStream stream) throws IOException { * * @param reader the reader * @return the new table - * @throws IOException if an I/O exception occurs + * @throws CsvReaderException If any sort of failure occurs. */ - public final NewTable parse(Reader reader) throws IOException { - try ( - final CSVParser csvParser = format().parse(reader)) { - final List records = csvParser.getRecords(); - if (hasHeaderRow() && records.isEmpty()) { - throw new IllegalStateException("Expected header row, none found"); - } - final List dataRecords = hasHeaderRow() ? 
records.subList(1, records.size()) : records; - if (!header().isPresent() && dataRecords.isEmpty()) { - throw new IllegalStateException("Unable to infer types with no TableHeader and no data"); - } - final int numColumns = records.get(0).size(); - if (numColumns == 0) { - throw new IllegalStateException("Unable to parse an empty CSV"); + public final Table parse(Reader reader) throws CsvReaderException { + final CsvReader csvReader = configureCsvReader(); + final CsvReader.Result result = csvReader.read(reader, new MySinkFactory()); + + final String[] columnNames = result.columnNames(); + final Sink[] sinks = result.columns(); + final Map> columns = new LinkedHashMap<>(); + long maxSize = 0; + for (int ii = 0; ii < columnNames.length; ++ii) { + final String columnName = columnNames[ii]; + final MySinkBase sink = (MySinkBase) sinks[ii]; + maxSize = Math.max(maxSize, sink.resultSize()); + columns.put(columnName, sink.result()); + } + final TableDefinition tableDef = TableDefinition.inferFrom(columns); + final TrackingRowSet rowSet = RowSetFactory.flat(maxSize).toTracking(); + return InMemoryTable.from(tableDef, rowSet, columns); + } + + private CsvReader configureCsvReader() { + final CsvReader csvReader = new CsvReader(); + + csvReader.setConcurrent(concurrent()); + csvReader.setIgnoreSurroundingSpaces(ignoreSurroundingSpaces()); + csvReader.setTrim(trim()); + csvReader.setHasHeaders(hasHeaderRow()); + csvReader.setquoteChar(quote()); + csvReader.setFieldDelimiter(delimiter()); + csvReader.setParsers(inference().parsers()); + + for (Map.Entry> entry : parserForName().entrySet()) { + csvReader.setParserFor(entry.getKey(), entry.getValue()); + } + for (Map.Entry> entry : parserForIndex().entrySet()) { + csvReader.setParserFor(entry.getKey(), entry.getValue()); + } + + + csvReader.setNullValueLiteral(nullValueLiteral()); + for (Map.Entry entry : nullValueLiteralForName().entrySet()) { + csvReader.setNullValueLiteralFor(entry.getKey(), entry.getValue()); + } + for (Map.Entry entry : nullValueLiteralForIndex().entrySet()) { + csvReader.setNullValueLiteralFor(entry.getKey(), entry.getValue()); + } + + if (header().isPresent()) { + final List headers = new ArrayList<>(); + for (ColumnHeader ch : header().get()) { + headers.add(ch.name()); + csvReader.setParserFor(ch.name(), typeToParser(ch.componentType())); } - final Iterable columnNames; - if (header().isPresent()) { - columnNames = header().get().columnNames(); - } else if (hasHeaderRow()) { - columnNames = legalizeColumnNames(records.get(0)); + csvReader.setHeaders(headers); + } + + + csvReader.setNullParser(inference().nullParser()); + + csvReader.setCustomTimeZoneParser(new TimeZoneParser()); + + csvReader.setNullBooleanAsByteValue(BooleanUtils.NULL_BOOLEAN_AS_BYTE) + .setNullByteValue(QueryConstants.NULL_BYTE) + .setNullShortValue(QueryConstants.NULL_SHORT) + .setNullIntValue(QueryConstants.NULL_INT) + .setNullLongValue(QueryConstants.NULL_LONG) + .setNullFloatValue(QueryConstants.NULL_FLOAT) + .setNullDoubleValue(QueryConstants.NULL_DOUBLE) + .setNullCharValue(QueryConstants.NULL_CHAR) + .setNullStringValue(null) + .setNullDateTimeAsLongValue(QueryConstants.NULL_LONG) + .setNullTimestampAsLongValue(QueryConstants.NULL_LONG); + + return csvReader; + } + + private static abstract class MySinkBase implements Sink { + private final ArrayBackedColumnSource result; + private final WritableColumnSource reinterpreted; + private final ChunkWrapInvoker chunkWrapInvoker; + private long resultSize = 0; + + public MySinkBase(ArrayBackedColumnSource 
result, Class interpClass, + ChunkWrapInvoker chunkWrapInvoker) { + this.result = result; + if (interpClass != null) { + reinterpreted = (WritableColumnSource) result.reinterpret(interpClass); } else { - columnNames = IntStream - .range(0, numColumns) - .mapToObj(i -> String.format("Column%d", i + 1)) - .collect(Collectors.toList()); + reinterpreted = result; } + this.chunkWrapInvoker = chunkWrapInvoker; + } - final NewTable.Builder table = NewTable.builder(); - int columnIndex = 0; - int size = -1; - for (String columnName : columnNames) { - final Parser parser = parser(columnName, columnIndex, dataRecords); - final Array array = buildArray(getColumn(columnIndex, dataRecords), parser, dataRecords.size()); - if (size == -1) { - size = array.size(); - } - table.putColumns(columnName, array); - ++columnIndex; + @Override + public final void write(TARRAY src, int srcOffset, long destOffset, int size, boolean appending_unused) { + if (size == 0) { + return; } - return table.size(size).build(); + final long requiredCapacity = destOffset + size; + reinterpreted.ensureCapacity(requiredCapacity); + resultSize = Math.max(resultSize, requiredCapacity); + try (final ChunkSink.FillFromContext context = reinterpreted.makeFillFromContext(size); + final RowSequence range = RowSequenceFactory.forRange(destOffset, destOffset + size - 1)) { + Chunk chunk = chunkWrapInvoker.apply(src, srcOffset, size); + reinterpreted.fillFromChunk(context, chunk, range); + } + } + + private interface ChunkWrapInvoker { + Chunk apply(TARRAY data, int offset, int capacity); + } + + public ArrayBackedColumnSource result() { + return result; + } + + public long resultSize() { + return resultSize; } } - private static List legalizeColumnNames(CSVRecord record) { - final Set taken = new HashSet<>(record.size()); - final List out = new ArrayList<>(record.size()); - for (String name : record) { - out.add(NameValidator.legalizeColumnName(name, (s) -> s.replaceAll("[- ]", "_"), taken)); + private static final class MyCharSink extends MySinkBase { + public MyCharSink() { + super(new CharacterArraySource(), null, CharChunk::chunkWrap); } - return out; } - private static Type type(TableHeader header, String columnName) { - final Type type = header.getHeader(columnName); - if (type != null) { - return type; + private static final class MyBooleanAsByteSink extends MySinkBase { + public MyBooleanAsByteSink() { + super(new BooleanArraySource(), byte.class, ByteChunk::chunkWrap); } - throw new IllegalArgumentException(String.format( - "When specifying a header, all columns must be accounted for. Missing type for column name '%s'", - columnName)); } - private Parser parser(String columnName, int columnIndex, List dataRecords) { - final Type type = header().map(header -> type(header, columnName)).orElse(null); + private static final class MyByteSink extends MySinkBase { + public MyByteSink() { + super(new ByteArraySource(), null, ByteChunk::chunkWrap); + } + } - // 1. 
An explicit parser if set - final Parser explicit = parsers().get(columnName); - if (explicit != null) { - if (type != null && !type.equals(explicit.type())) { - throw new IllegalArgumentException("Explicit parser type and column header type do not match"); - } - return explicit; + private static final class MyShortSink extends MySinkBase { + public MyShortSink() { + super(new ShortArraySource(), null, ShortChunk::chunkWrap); } + } + + private static final class MyIntSink extends MySinkBase { + public MyIntSink() { + super(new IntegerArraySource(), null, IntChunk::chunkWrap); + } + } - final InferenceSpecs inference; - if (type != null) { - // 2. Guided inference - inference = inference().limitToType(type); - } else { - // 3. Original inference - inference = inference(); + private static final class MyLongSink extends MySinkBase { + public MyLongSink() { + super(new LongArraySource(), null, LongChunk::chunkWrap); } + } + + private static final class MyFloatSink extends MySinkBase { + public MyFloatSink() { + super(new FloatArraySource(), null, FloatChunk::chunkWrap); + } + } + + private static final class MyDoubleSink extends MySinkBase { + public MyDoubleSink() { + super(new DoubleArraySource(), null, DoubleChunk::chunkWrap); + } + } - final Optional> p = inference.infer(getColumn(columnIndex, dataRecords)); - if (!p.isPresent()) { - throw new IllegalStateException( - String.format("Unable to infer type for column '%s'", columnName)); + private static final class MyStringSink extends MySinkBase { + public MyStringSink() { + super(new ObjectArraySource<>(String.class), null, ObjectChunk::chunkWrap); } - return p.get(); } - private static Array buildArray(Iterator it, Parser parser, int size) { - final ArrayBuilder builder = Array.builder(parser.type(), size); - while (it.hasNext()) { - final T item = parser.parse(it.next()); - builder.add(item); + private static final class MyDateTimeAsLongSink extends MySinkBase { + public MyDateTimeAsLongSink() { + super(new DateTimeArraySource(), long.class, LongChunk::chunkWrap); + } + } + + private static class MySinkFactory implements SinkFactory { + @Override + public Sink makeBooleanAsByteSink() { + return new MyBooleanAsByteSink(); + } + + @Override + public Sink makeByteSink() { + return new MyByteSink(); + } + + @Override + public Sink makeShortSink() { + return new MyShortSink(); + } + + @Override + public Sink makeIntSink() { + return new MyIntSink(); + } + + @Override + public Sink makeLongSink() { + return new MyLongSink(); + } + + @Override + public Sink makeFloatSink() { + return new MyFloatSink(); + } + + @Override + public Sink makeDoubleSink() { + return new MyDoubleSink(); + } + + @Override + public Sink makeCharSink() { + return new MyCharSink(); + } + + @Override + public Sink makeStringSink() { + return new MyStringSink(); + } + + @Override + public Sink makeDateTimeAsLongSink() { + return new MyDateTimeAsLongSink(); + } + + @Override + public Sink makeTimestampAsLongSink() { + return new MyDateTimeAsLongSink(); } - return builder.build(); } - private static Iterator getColumn(int index, Iterable records) { - return new CsvColumnIterator(index, records.iterator()); + private static Parser typeToParser(Type type) { + return type.walk(new MyVisitor()).out; } - private static class CsvColumnIterator implements Iterator { - private final int index; - private final Iterator it; + private static final class MyVisitor implements Type.Visitor, PrimitiveType.Visitor, GenericType.Visitor { + private Parser out; + + @Override + public void 
visit(PrimitiveType primitiveType) { + primitiveType.walk((PrimitiveType.Visitor) this); + } + + @Override + public void visit(GenericType genericType) { + genericType.walk((GenericType.Visitor) this); + } + + @Override + public void visit(BooleanType booleanType) { + out = Parsers.BOOLEAN; + } + + @Override + public void visit(ByteType byteType) { + out = Parsers.BYTE; + } + + @Override + public void visit(CharType charType) { + out = Parsers.CHAR; + } + + @Override + public void visit(ShortType shortType) { + out = Parsers.SHORT; + } + + @Override + public void visit(IntType intType) { + out = Parsers.INT; + } - public CsvColumnIterator(int index, Iterator it) { - this.index = index; - this.it = Objects.requireNonNull(it); + @Override + public void visit(LongType longType) { + out = Parsers.LONG; } @Override - public boolean hasNext() { - return it.hasNext(); + public void visit(FloatType floatType) { + out = Parsers.FLOAT; + } + + @Override + public void visit(DoubleType doubleType) { + out = Parsers.DOUBLE; + } + + @Override + public void visit(StringType stringType) { + out = Parsers.STRING; + } + + @Override + public void visit(InstantType instantType) { + throw new RuntimeException("Logic error: there is no Parser for " + instantType); + } + + @Override + public void visit(ArrayType arrayType) { + throw new RuntimeException("Logic error: there is no Parser for " + arrayType); + } + + @Override + public void visit(CustomType customType) { + throw new RuntimeException("Logic error: there is no Parser for " + customType); + } + } + + /** + * A class that aids in Deephaven TimeZone parsing. In particular it memorizes the set of known Deephaven + * DateTimeZones and keeps them in a hashmap for fast lookup. It also remembers the last timezone looked up for even + * faster access. It is used as a callback for the Tokenizer class. + */ + private static final class TimeZoneParser implements Tokenizer.CustomTimeZoneParser { + private static final String DEEPHAVEN_TZ_PREFIX = "TZ_"; + private static final int MAX_DEEPHAVEN_TZ_LENGTH = 3; + + private final TIntObjectHashMap zoneIdMap = new TIntObjectHashMap<>(); + + private int lastTzKey = -1; + private ZoneId lastZoneId = null; + + public TimeZoneParser() { + for (TimeZone zone : TimeZone.values()) { + final String zname = zone.name(); + if (!zname.startsWith(DEEPHAVEN_TZ_PREFIX)) { + throw new RuntimeException("Logic error: unexpected enum in DBTimeZone: " + zname); + } + final String zSuffix = zname.substring(DEEPHAVEN_TZ_PREFIX.length()); + final int zlen = zSuffix.length(); + if (zlen > MAX_DEEPHAVEN_TZ_LENGTH) { + throw new RuntimeException("Logic error: unexpectedly-long enum in DBTimeZone: " + zname); + } + final byte[] data = new byte[zlen]; + for (int ii = 0; ii < zlen; ++ii) { + final char ch = zSuffix.charAt(ii); + if (!RangeTests.isUpper(ch)) { + throw new RuntimeException("Logic error: unexpected character in DBTimeZone name: " + zname); + } + data[ii] = (byte) ch; + } + final ByteSlice bs = new ByteSlice(data, 0, data.length); + final int tzKey = tryParseTzKey(bs); + if (tzKey < 0) { + throw new RuntimeException("Logic error: can't parse DBTimeZone as key: " + zname); + } + final ZoneId zoneId = zone.getTimeZone().toTimeZone().toZoneId(); + zoneIdMap.put(tzKey, zoneId); + } } @Override - public String next() { - CSVRecord next = it.next(); - String stringValue = next.get(index); - // treating empty string as null - return stringValue.isEmpty() ? 
null : stringValue; + public boolean tryParse(ByteSlice bs, MutableObject zoneId, MutableLong offsetSeconds) { + if (bs.size() == 0 || bs.front() != ' ') { + return false; + } + final int savedBegin = bs.begin(); + bs.setBegin(bs.begin() + 1); + final int tzKey = tryParseTzKey(bs); + if (tzKey < 0) { + bs.setBegin(savedBegin); + return false; + } + if (tzKey != lastTzKey) { + final ZoneId res = zoneIdMap.get(tzKey); + if (res == null) { + bs.setBegin(savedBegin); + return false; + } + lastTzKey = tzKey; + lastZoneId = res; + } + zoneId.setValue(lastZoneId); + offsetSeconds.setValue(0); + return true; + } + + /** + * Take up to three uppercase characters from a TimeZone string and pack them into an integer. + * + * @param bs A ByteSlice holding the timezone key. + * @return The characters packed into an int, or -1 if there are too many or too few characters, or if the + * characters are not uppercase ASCII. + */ + private static int tryParseTzKey(final ByteSlice bs) { + int res = 0; + int current; + for (current = bs.begin(); current != bs.end(); ++current) { + if (current - bs.begin() > MAX_DEEPHAVEN_TZ_LENGTH) { + return -1; + } + final char ch = RangeTests.toUpper((char) bs.data()[current]); + if (!RangeTests.isUpper(ch)) { + // If it's some nonalphabetic character + break; + } + res = res * 26 + (ch - 'A'); + } + if (current - bs.begin() == 0) { + return -1; + } + bs.setBegin(current); + return res; } } } diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java index 0047267a157..0219b27e3ce 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java @@ -5,6 +5,8 @@ package io.deephaven.csv; import io.deephaven.base.Procedure; +import io.deephaven.csv.reading.CsvReader; +import io.deephaven.csv.util.CsvReaderException; import io.deephaven.datastructures.util.CollectionUtil; import io.deephaven.engine.table.DataColumn; import io.deephaven.engine.table.MatchPair; @@ -56,7 +58,7 @@ public class CsvTools { * @see #readCsv(String, CsvSpecs) */ @ScriptApi - public static Table readCsv(String path) throws IOException { + public static Table readCsv(String path) throws CsvReaderException { return readCsv(path, CsvSpecs.csv()); } @@ -70,7 +72,7 @@ public static Table readCsv(String path) throws IOException { * @see #readCsv(InputStream, CsvSpecs) */ @ScriptApi - public static Table readCsv(InputStream stream) throws IOException { + public static Table readCsv(InputStream stream) throws CsvReaderException { return readCsv(stream, CsvSpecs.csv()); } @@ -83,7 +85,7 @@ public static Table readCsv(InputStream stream) throws IOException { * @see #readCsv(URL, CsvSpecs) */ @ScriptApi - public static Table readCsv(URL url) throws IOException { + public static Table readCsv(URL url) throws CsvReaderException { return readCsv(url, CsvSpecs.csv()); } @@ -100,7 +102,7 @@ public static Table readCsv(URL url) throws IOException { * @see #readCsv(Path, CsvSpecs) */ @ScriptApi - public static Table readCsv(Path path) throws IOException { + public static Table readCsv(Path path) throws CsvReaderException { return readCsv(path, CsvSpecs.csv()); } @@ -122,7 +124,7 @@ public static Table readCsv(Path path) throws IOException { * @see #readCsv(Path, CsvSpecs) */ @ScriptApi - public static Table readCsv(String path, CsvSpecs specs) throws IOException { + public static Table readCsv(String path, CsvSpecs specs) throws CsvReaderException { URL: { final URL url; try 
{ @@ -145,11 +147,11 @@ public static Table readCsv(String path, CsvSpecs specs) throws IOException { * @param stream the stream * @param specs the csv specs * @return the table - * @throws IOException if an I/O exception occurs + * @throws CsvReaderException If some error occurs. */ @ScriptApi - public static Table readCsv(InputStream stream, CsvSpecs specs) throws IOException { - return InMemoryTable.from(specs.parse(stream)); + public static Table readCsv(InputStream stream, CsvSpecs specs) throws CsvReaderException { + return specs.parse(stream); } /** @@ -158,11 +160,16 @@ public static Table readCsv(InputStream stream, CsvSpecs specs) throws IOExcepti * @param url the url * @param specs the csv specs * @return the table - * @throws IOException if an I/O exception occurs + * @throws CsvReaderException If some CSV reading error occurs. + * @throws IOException if the URL cannot be opened. */ @ScriptApi - public static Table readCsv(URL url, CsvSpecs specs) throws IOException { - return InMemoryTable.from(specs.parse(url.openStream())); + public static Table readCsv(URL url, CsvSpecs specs) throws CsvReaderException { + try { + return specs.parse(url.openStream()); + } catch (IOException inner) { + throw new CsvReaderException("Caught exception", inner); + } } /** @@ -175,12 +182,17 @@ public static Table readCsv(URL url, CsvSpecs specs) throws IOException { * @param path the path * @param specs the csv specs * @return the table + * @throws CsvReaderException If some CSV reading error occurs. * @throws IOException if an I/O exception occurs * @see PathUtil#open(Path) */ @ScriptApi - public static Table readCsv(Path path, CsvSpecs specs) throws IOException { - return InMemoryTable.from(specs.parse(PathUtil.open(path))); + public static Table readCsv(Path path, CsvSpecs specs) throws CsvReaderException { + try { + return specs.parse(PathUtil.open(path)); + } catch (IOException inner) { + throw new CsvReaderException("Caught exception", inner); + } } /** @@ -217,7 +229,7 @@ public static MatchPair[] renamesForHeaderless(String... columnNames) { * {@code CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames));} */ @ScriptApi - public static Table readHeaderlessCsv(String filePath, Collection columnNames) throws IOException { + public static Table readHeaderlessCsv(String filePath, Collection columnNames) throws CsvReaderException { return CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames)); } @@ -226,7 +238,7 @@ public static Table readHeaderlessCsv(String filePath, Collection column * {@code CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames));} */ @ScriptApi - public static Table readHeaderlessCsv(String filePath, String... columnNames) throws IOException { + public static Table readHeaderlessCsv(String filePath, String... columnNames) throws CsvReaderException { return CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames)); } @@ -243,7 +255,7 @@ public static Table readHeaderlessCsv(String filePath, String... 
columnNames) th */ @ScriptApi @Deprecated - public static Table readCsv(InputStream is, final String format) throws IOException { + public static Table readCsv(InputStream is, final String format) throws CsvReaderException { final CsvSpecs specs = CsvSpecs.fromLegacyFormat(format); if (specs == null) { throw new IllegalArgumentException(String.format("Unable to map legacy format '%s' into CsvSpecs", format)); @@ -263,8 +275,8 @@ public static Table readCsv(InputStream is, final String format) throws IOExcept */ @ScriptApi @Deprecated - public static Table readCsv(InputStream is, final char separator) throws IOException { - return InMemoryTable.from(CsvSpecs.builder().delimiter(separator).build().parse(is)); + public static Table readCsv(InputStream is, final char separator) throws CsvReaderException { + return CsvSpecs.builder().delimiter(separator).build().parse(is); } private static boolean isStandardFile(URL url) { diff --git a/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java b/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java index 127f6bad617..ad79cdf637a 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java @@ -1,17 +1,13 @@ package io.deephaven.csv; import io.deephaven.annotations.BuildableStyle; -import io.deephaven.qst.type.Type; -import org.immutables.value.Value.Check; +import io.deephaven.csv.parsers.Parser; +import io.deephaven.csv.parsers.Parsers; import org.immutables.value.Value.Default; import org.immutables.value.Value.Immutable; import org.jetbrains.annotations.Nullable; -import java.time.Instant; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; +import java.util.*; /** * Inference specifications contains the configuration and logic for inferring an acceptable parser from string values. @@ -21,6 +17,19 @@ @Immutable @BuildableStyle public abstract class InferenceSpecs { + public static final List> STRINGS_PARSERS = Parsers.STRINGS; + + public static final List> MINIMAL_PARSERS = Parsers.MINIMAL; + + public static final List> STANDARD_PARSERS = Parsers.DEFAULT; + + public static final List> STANDARD_TIMES_PARSERS = Parsers.STANDARD_TIMES_PARSERS; + + public static final List> STANDARD_MILLITIMES_PARSERS = Parsers.STANDARD_MILLITIMES_PARSERS; + + public static final List> STANDARD_MICROTIMES_PARSERS = Parsers.STANDARD_MICROTIMES_PARSERS; + + public static final List> STANDARD_NANOTIMES_PARSERS = Parsers.STANDARD_NANOTIMES_PARSERS; /** * Creates a builder for {@link InferenceSpecs}. @@ -34,178 +43,56 @@ public static Builder builder() { /** * The string-only inference. * - *

- * Contains the following parsers:
- *
- * <ul>
- * <li>{@link Parser#STRING}</li>
- * <li>{@link Parser#INSTANT}</li>
- * <li>{@link Parser#SHORT}</li>
- * <li>{@link Parser#INT}</li>
- * <li>{@link Parser#LONG}</li>
- * <li>{@link Parser#DOUBLE}</li>
- * <li>{@link Parser#BOOL}</li>
- * <li>{@link Parser#CHAR}</li>
- * <li>{@link Parser#BYTE}</li>
- * <li>{@link Parser#FLOAT}</li>
- * </ul>
- *
- * Uses the default {@link #onNullParser()}.
- *
- * <p>

- * Note: the non-string parsers are only relevant when the appropriate {@link #limitToType(Type)} is invoked. - * * @return the string-only inference */ public static InferenceSpecs strings() { - return builder().addParsers( - Parser.STRING, - Parser.INSTANT, - Parser.SHORT, - Parser.INT, - Parser.LONG, - Parser.DOUBLE, - Parser.BOOL, - Parser.CHAR, - Parser.BYTE, - Parser.FLOAT) - .build(); + return builder().addAllParsers(STRINGS_PARSERS).build(); } /** * The "minimal" inference. - * - *

- * Contains the following parsers:
- *
- * <ul>
- * <li>{@link Parser#INSTANT}</li>
- * <li>{@link Parser#LONG}</li>
- * <li>{@link Parser#DOUBLE}</li>
- * <li>{@link Parser#BOOL}</li>
- * <li>{@link Parser#STRING}</li>
- * <li>{@link Parser#BYTE}</li>
- * <li>{@link Parser#SHORT}</li>
- * <li>{@link Parser#INT}</li>
- * <li>{@link Parser#FLOAT}</li>
- * <li>{@link Parser#CHAR}</li>
- * </ul>
- *
- * Uses the default {@link #onNullParser()}.
- *
- * <p>

- * Note: the byte, short, int, float, and char parsers are only relevant when the appropriate - * {@link #limitToType(Type)} is invoked. - * - * @return the minimal inference */ public static InferenceSpecs minimal() { - return builder().addParsers( - Parser.INSTANT, - Parser.LONG, - Parser.DOUBLE, - Parser.BOOL, - Parser.STRING, - Parser.BYTE, - Parser.SHORT, - Parser.INT, - Parser.FLOAT, - Parser.CHAR) - .build(); + return builder().addAllParsers(MINIMAL_PARSERS).build(); } /** * The "standard" inference, does not parse floats or bytes. - * - *

- * Contains the following parsers:
- *
- * <ul>
- * <li>{@link Parser#INSTANT}</li>
- * <li>{@link Parser#SHORT}</li>
- * <li>{@link Parser#INT}</li>
- * <li>{@link Parser#LONG}</li>
- * <li>{@link Parser#DOUBLE}</li>
- * <li>{@link Parser#BOOL}</li>
- * <li>{@link Parser#CHAR}</li>
- * <li>{@link Parser#STRING}</li>
- * <li>{@link Parser#BYTE}</li>
- * <li>{@link Parser#FLOAT}</li>
- * </ul>
- *
- * Uses the default {@link #onNullParser()}.
- *
- * <p>

- * Note: the byte and float parsers are only relevant when the appropriate {@link #limitToType(Type)} is invoked. - * - * @return the standard inference */ public static InferenceSpecs standard() { - return builder().addParsers( - Parser.INSTANT, - Parser.SHORT, - Parser.INT, - Parser.LONG, - Parser.DOUBLE, - Parser.BOOL, - Parser.CHAR, - Parser.STRING, - Parser.BYTE, - Parser.FLOAT) - .build(); + return builder().addAllParsers(STANDARD_PARSERS).build(); } /** * The standard parsers with additional {@link java.time.Instant}-based parsing. * - *

- * Contains the following parsers:
- *
- * <ul>
- * <li>{@link Parser#INSTANT}</li>
- * <li>{@link Parser#INSTANT_LEGACY}</li>
- * <li>{@link Parser#epochAny21stCentury(Parser)}, with {@link Parser#LONG}</li>
- * <li>{@link Parser#SHORT}</li>
- * <li>{@link Parser#INT}</li>
- * <li>{@link Parser#LONG}</li>
- * <li>{@link Parser#DOUBLE}</li>
- * <li>{@link Parser#BOOL}</li>
- * <li>{@link Parser#CHAR}</li>
- * <li>{@link Parser#STRING}</li>
- * <li>{@link Parser#BYTE}</li>
- * <li>{@link Parser#FLOAT}</li>
- * </ul>
- *
- * Uses the default {@link #onNullParser()}.
- *
- * <p>

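The inference presets in this file are now thin wrappers over predefined Parsers lists, so choosing a different epoch resolution, or the fallback used when every cell in a column is null, is a builder call. A sketch, assuming the immutables-generated builder mirrors the Builder interface shown here:

```java
import io.deephaven.csv.CsvSpecs;
import io.deephaven.csv.InferenceSpecs;
import io.deephaven.csv.parsers.Parsers;

public class InferenceExample {
    public static CsvSpecs milliSpecs() {
        // Same parser set as milliTimes(), built explicitly; all-null
        // columns fall back to the String parser.
        final InferenceSpecs inference = InferenceSpecs.builder()
                .addAllParsers(InferenceSpecs.STANDARD_MILLITIMES_PARSERS)
                .nullParser(Parsers.STRING)
                .build();
        return CsvSpecs.builder().inference(inference).build();
    }
}
```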
- * Note: the byte and float parsers are only relevant when the appropriate {@link #limitToType(Type)} is invoked. - * * @return the standard times inference */ public static InferenceSpecs standardTimes() { - final List> parsers = Parser.epochAny21stCentury(Parser.LONG); - return builder().addParsers( - Parser.INSTANT, - Parser.INSTANT_LEGACY, - parsers.get(0), - parsers.get(1), - parsers.get(2), - parsers.get(3), - Parser.SHORT, - Parser.INT, - Parser.LONG, - Parser.DOUBLE, - Parser.BOOL, - Parser.CHAR, - Parser.STRING, - Parser.BYTE, - Parser.FLOAT) - .build(); + return builder().addAllParsers(STANDARD_TIMES_PARSERS).build(); + } + + public static InferenceSpecs milliTimes() { + return builder().addAllParsers(STANDARD_MILLITIMES_PARSERS).build(); + } + + public static InferenceSpecs microTimes() { + return builder().addAllParsers(STANDARD_MICROTIMES_PARSERS).build(); + } + + public static InferenceSpecs nanoTimes() { + return builder().addAllParsers(STANDARD_NANOTIMES_PARSERS).build(); } /** - * The parsers, in preference-based order. + * The parsers that the user wants to participate in type inference. Note that the order that the parsers in this + * list matters only for custom parsers. In particular: + *

    + *
+ * <ol>
+ * <li>Standard system parsers (singletons from the {@link Parsers} class) will run in their standard precedence
+ * order, regardless of the order they appear here.</li>
+ * <li>All specified system parsers will be run before any specified custom parsers.</li>
+ * <li>Custom parsers will be run in the order they are specified here.</li>
+ * </ol>
* * @return the parsers */ @@ -215,153 +102,19 @@ public static InferenceSpecs standardTimes() { * The parser to return when all values are null. May be {@code null}. * *

- * By default, returns a {@link Parser#STRING}. + * By default, returns a {@link Parsers#STRING}. * * @return the on-null values parser */ @Default @Nullable - public Parser onNullParser() { - return Parser.STRING; - } - - /** - * Filters out all parsers that do not have {@code type}. - * - *

- * {@link #onNullParser()} will be set to the first parser that matches {@code type}. - * - * @param type the type to limit to - * @return the new inference based on type - */ - public InferenceSpecs limitToType(Type type) { - Parser first = null; - final Builder builder = builder(); - for (Parser parser : parsers()) { - if (type.equals(parser.type())) { - builder.addParsers(parser); - if (first == null) { - first = parser; - } - } - } - return builder.onNullParser(first).build(); - } - - /** - * Finds the best parser by checking and eliminating parsers based on {@link Parser#canParse(String)}. The returned - * parser will be the lowest indexed parser remaining based on the order specified in {@link #parsers()}. - * - *

- * When all {@code values} are null, the returned value will be an optional that wraps {@link #onNullParser()}. - * - * @param values the values to be inferred - * @return the best parser, if any - */ - public Optional> infer(Iterator values) { - final List> candidates = collect(); - final List> hasParsed = new ArrayList<>(); - boolean allNull = true; - while (values.hasNext() && !candidates.isEmpty()) { - final String item = values.next(); - if (item != null) { - allNull = false; - if (candidates.size() <= 1) { - break; - } - hasParsed.clear(); - final Iterator> it = candidates.iterator(); - NEXT_PARSER: while (it.hasNext()) { - final Parser parser = it.next(); - for (Parser alreadyParsed : hasParsed) { - // If a more specific parser has already run, we know we don't need to check this parser. - // For example, if SHORT has already successfully parsed, we don't need to check INT. - // isSuperset(INT, SHORT) == true - if (isSuperset(parser, alreadyParsed)) { - // Note: we *don't* have to add parser to hasParsed, since superset properties are - // transitive - continue NEXT_PARSER; - } - } - if (parser.canParse(item)) { - hasParsed.add(parser); - } else { - it.remove(); - } - } - } - } - if (allNull) { - return Optional.ofNullable(onNullParser()); - } - return candidates.stream().findFirst(); - } - - @Check - final void checkNonEmpty() { - if (parsers().isEmpty()) { - throw new IllegalArgumentException("Must provide at least one parser for inference"); - } - } - - private List> collect() { - final List> collected = new ArrayList<>(); - for (Parser candidate : parsers()) { - // If anything we've already collected is a superset of the candidate, discard the candidate. - // For example, if INT is already collected, we don't need to even consider SHORT. - boolean useCandidate = true; - for (Parser actual : collected) { - if (isSuperset(actual, candidate)) { - useCandidate = false; - break; - } - } - if (useCandidate) { - collected.add(candidate); - } - } - return collected; - } - - /** - * {@code first} is a superset of {@code second} if {@code first} will parse all the values that {@code second} will - * parse. - */ - private static boolean isSuperset(Parser first, Parser second) { - if (first == Parser.STRING) { - return true; - } - if (first == Parser.DOUBLE) { - return second == Parser.FLOAT - || second == Parser.LONG - || second == Parser.INT - || second == Parser.SHORT - || second == Parser.BYTE; - } - if (first == Parser.FLOAT) { - // Note: *superset* here means will parse all the same (or more) inputs. - // Floats *can* parse everything that Double can parse. 
- return second == Parser.DOUBLE - || second == Parser.LONG - || second == Parser.INT - || second == Parser.SHORT - || second == Parser.BYTE; - } - if (first == Parser.LONG) { - return second == Parser.INT || second == Parser.SHORT || second == Parser.BYTE; - } - if (first == Parser.INT) { - return second == Parser.SHORT || second == Parser.BYTE; - } - if (first == Parser.SHORT) { - return second == Parser.BYTE; - } - return false; + public Parser nullParser() { + return Parsers.STRING; } public interface Builder { - Builder onNullParser(Parser parser); + Builder nullParser(Parser parser); Builder addParsers(Parser item); diff --git a/extensions/csv/src/main/java/io/deephaven/csv/Parser.java b/extensions/csv/src/main/java/io/deephaven/csv/Parser.java deleted file mode 100644 index 03cdabd38a5..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/Parser.java +++ /dev/null @@ -1,376 +0,0 @@ -package io.deephaven.csv; - -import io.deephaven.time.DateTimeUtils; -import io.deephaven.qst.type.Type; - -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDate; -import java.time.ZoneOffset; -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.function.Function; - -/** - * A parser is responsible for parsing strings into parsed types. - * - * @param the parsed type - */ -public class Parser { - - /** - * A parser exception. - */ - public static class ParserException extends IllegalArgumentException { - private final String value; - - public ParserException(String value, String message) { - super(message); - this.value = value; - } - - public ParserException(String value, Throwable cause) { - super(cause); - this.value = value; - } - - public String value() { - return value; - } - } - - /** - * A parser that maps the case-insensitive string "true" to {@code true}, and "false" to {@code false}. - */ - public static final Parser BOOL = new Parser<>(Type.booleanType(), Parser::parseBool); - - /** - * A parser that delegates to {@link Byte#parseByte(String)}. - */ - public static final Parser BYTE = new Parser<>(Type.byteType(), Byte::parseByte); - - /** - * A parses that returns the first character of the string if there is exactly one character in the string. - */ - public static final Parser CHAR = new Parser<>(Type.charType(), Parser::parseChar); - - /** - * A parser that delegates to {@link Short#parseShort(String)}. - */ - public static final Parser SHORT = new Parser<>(Type.shortType(), Short::parseShort); - - /** - * A parser that delegates to {@link Integer#parseInt(String)}. - */ - public static final Parser INT = new Parser<>(Type.intType(), Integer::parseInt); - - /** - * A parser that delegates to {@link Long#parseLong(String)}. - */ - public static final Parser LONG = new Parser<>(Type.longType(), Long::parseLong); - - /** - * A parser that delegates non-trimmable strings to {@link Float#parseFloat(String)}. - * - *

- * Note: if the string is trimmable, the parsing fails. This is to remain consistent with the parsing of integral - * values. - */ - public static final Parser FLOAT = new Parser<>(Type.floatType(), Parser::parseFloat); - - /** - * A parser that delegates non-trimmable strings to {@link Double#parseDouble(String)}. - * - *

- * Note: if the string is trimmable, the parsing fails. This is to remain consistent with the parsing of integral - * values. - */ - public static final Parser DOUBLE = new Parser<>(Type.doubleType(), Parser::parseDouble); - - /** - * A parser that delegates to {@link Instant#parse(CharSequence)}. - */ - public static final Parser INSTANT = new Parser<>(Type.instantType(), Instant::parse); - - /** - * A parser that delegates to {@link DateTimeUtils#convertDateTime(String)}. - */ - public static final Parser INSTANT_LEGACY = new Parser<>(Type.instantType(), Parser::parseAsDateFormat); - - /** - * A naive parser, which returns the same string value it was passed in. - */ - public static final Parser STRING = new Parser<>(Type.stringType(), Function.identity()); - - /** - * A parser that will parse long values as epoch seconds. - * - * @param longParser the long parser - * @param min the minimum instant to infer, may be null - * @param max the maximum instant to infer, may be null - * @return the epoch second parser - * - * @see #epochAnyParser(Parser, Instant, Instant) - */ - public static Parser epochSecondParser(Parser longParser, Instant min, Instant max) { - if (min != null && max != null && min.isAfter(max)) { - throw new IllegalArgumentException(String.format("min is greater that max: %s > %s", min, max)); - } - return new Parser<>(Type.instantType(), s -> parseAsEpochSeconds(longParser, min, max, s)); - } - - /** - * A parser that will parse long values as epoch milliseconds. - * - * @param longParser the long parser - * @param min the minimum instant to infer, may be null - * @param max the maximum instant to infer, may be null - * @return the epoch milli parser - * - * @see #epochAnyParser(Parser, Instant, Instant) - */ - public static Parser epochMilliParser(Parser longParser, Instant min, Instant max) { - if (min != null && max != null && min.isAfter(max)) { - throw new IllegalArgumentException(String.format("min is greater that max: %s > %s", min, max)); - } - return new Parser<>(Type.instantType(), s -> parseAsEpochMillis(longParser, min, max, s)); - } - - /** - * A parser that will parse long values as epoch microseconds. - * - * @param longParser the long parser - * @param min the minimum instant to infer, may be null - * @param max the maximum instant to infer, may be null - * @return the epoch micro parser - * - * @see #epochAnyParser(Parser, Instant, Instant) - */ - public static Parser epochMicroParser(Parser longParser, Instant min, Instant max) { - if (min != null && max != null && min.isAfter(max)) { - throw new IllegalArgumentException(String.format("min is greater that max: %s > %s", min, max)); - } - return new Parser<>(Type.instantType(), s -> parseAsEpochMicros(longParser, min, max, s)); - } - - /** - * A parser that will parse long values as epoch nanoseconds. 
- * - * @param longParser the long parser - * @param min the minimum instant to infer, may be null - * @param max the maximum instant to infer, may be null - * @return the epoch nano parser - * - * @see #epochAnyParser(Parser, Instant, Instant) - */ - public static Parser epochNanoParser(Parser longParser, Instant min, Instant max) { - if (min != null && max != null && min.isAfter(max)) { - throw new IllegalArgumentException(String.format("min is greater that max: %s > %s", min, max)); - } - return new Parser<>(Type.instantType(), s -> parseAsEpochNanos(longParser, min, max, s)); - } - - /** - * Returns four parsers that will parse long values as epoch seconds, milliseconds, epoch microseconds, and epoch - * nanoseconds based on non-overlapping min/max ranges. - * - *

- * Note: the duration between the epoch and the max must be less than 1000 times the duration between the epoch and - * the min. - * - * @param longParser the long parser - * @param min the minimum instant to infer - * @param max the maximum instant to infer - * @return the epoch milli and micro parsers - * - * @see #epochSecondParser(Parser, Instant, Instant) - * @see #epochMilliParser(Parser, Instant, Instant) - * @see #epochMicroParser(Parser, Instant, Instant) - * @see #epochNanoParser(Parser, Instant, Instant) - * @see #epochAny21stCentury(Parser) - */ - public static List> epochAnyParser(Parser longParser, Instant min, Instant max) { - if (min.isAfter(max)) { - throw new IllegalArgumentException(String.format("min is greater that max: %s > %s", min, max)); - } - if (Duration.between(Instant.EPOCH, max) - .compareTo(Duration.between(Instant.EPOCH, min).multipliedBy(1000)) >= 0) { - throw new IllegalArgumentException("Unable to do proper inference on instants, has overlapping range"); - } - return Arrays.asList( - epochSecondParser(longParser, min, max), - epochMilliParser(longParser, min, max), - epochMicroParser(longParser, min, max), - epochNanoParser(longParser, min, max)); - } - - /** - * Returns four parser that will parse long values as epoch seconds, epoch milliseconds, epoch microseconds, and - * epoch nanoseconds from the 21st century. - * - * @param longParser the long parser - * @return the 21st century epoch second, milli, micro, and nanoseconds parsers - * @see #epochAnyParser(Parser, Instant, Instant) - */ - public static List> epochAny21stCentury(Parser longParser) { - final Instant min = LocalDate.ofYearDay(2000, 1).atStartOfDay().toInstant(ZoneOffset.UTC); - final Instant max = LocalDate.ofYearDay(2100, 1).atStartOfDay().toInstant(ZoneOffset.UTC).minusNanos(1); - return epochAnyParser(longParser, min, max); - } - - private final Type type; - private final Function function; - - /** - * Creates a parser. The {@code function} is passed non-null strings, and expected to return the parsed value, or - * throw an appropriate {@link RuntimeException}. - * - * @param type the type - * @param function the function - */ - public Parser(Type type, Function function) { - this.type = Objects.requireNonNull(type); - this.function = Objects.requireNonNull(function); - } - - public Type type() { - return type; - } - - /** - * Parses {@code value} when non-null, otherwise returns null. - * - *

- * This method catches {@link RuntimeException} from {@code function} and converts them to {@link ParserException}. - * - * @param value the string to parse - * @return the parsed value, or null - * @throws ParserException if {@code value} can't be parsed - */ - public T parse(String value) { - if (value == null) { - return null; - } - try { - return function.apply(value); - } catch (RuntimeException t) { - if (t instanceof ParserException) { - throw t; - } - throw new ParserException(value, t); - } - } - - /** - * Checks if {@code this} parser can parse {@code value}. - * - *

- * {@code null} values are always return true. - * - * @param value the value - * @return true if the value can be parsed. - */ - public boolean canParse(String value) { - if (value == null) { - return true; - } - try { - function.apply(value); - } catch (RuntimeException t) { - return false; - } - return true; - } - - private static boolean parseBool(String value) { - if (value.equalsIgnoreCase("true")) { - return true; - } - if (value.equalsIgnoreCase("false")) { - return false; - } - throw new ParserException(value, "Value is not a boolean"); - } - - private static char parseChar(String value) { - if (value.length() != 1) { - throw new ParserException(value, "Value is not a char"); - } - return value.charAt(0); - } - - private static float parseFloat(String value) { - if (isTrimmable(value)) { - throw new ParserException(value, "Not parsing floats that are trimmable"); - } - return Float.parseFloat(value); - } - - private static double parseDouble(String value) { - if (isTrimmable(value)) { - throw new ParserException(value, "Not parsing doubles that are trimmable"); - } - return Double.parseDouble(value); - } - - private static boolean isTrimmable(String value) { - return !value.isEmpty() && (value.charAt(0) <= ' ' || value.charAt(value.length() - 1) <= ' '); - } - - private static Instant parseAsDateFormat(String value) { - return DateTimeUtils.convertDateTime(value).getInstant(); - } - - private static Instant parseAsEpochSeconds(Parser longParser, Instant min, Instant max, String value) { - final long epochSecond = longParser.parse(value); - final Instant instant = Instant.ofEpochSecond(epochSecond); - if (min != null && instant.isBefore(min)) { - throw new ParserException(value, "Long seconds is less than min instant"); - } - if (max != null && instant.isAfter(max)) { - throw new ParserException(value, "Long seconds is greater than max instant"); - } - return instant; - } - - private static Instant parseAsEpochMillis(Parser longParser, Instant min, Instant max, String value) { - final long epochMilli = longParser.parse(value); - final Instant instant = Instant.ofEpochMilli(epochMilli); - if (min != null && instant.isBefore(min)) { - throw new ParserException(value, "Long millis is less than min instant"); - } - if (max != null && instant.isAfter(max)) { - throw new ParserException(value, "Long millis is greater than max instant"); - } - return instant; - } - - private static Instant parseAsEpochMicros(Parser longParser, Instant min, Instant max, String value) { - final long epochMicro = longParser.parse(value); - final long epochSecond = Math.floorDiv(epochMicro, 1_000_000); - final int nanoAdj = (int) Math.floorMod(epochMicro, 1_000_000) * 1_000; - final Instant instant = Instant.ofEpochSecond(epochSecond, nanoAdj); - if (min != null && instant.isBefore(min)) { - throw new ParserException(value, "Long micros is less than min instant"); - } - if (max != null && instant.isAfter(max)) { - throw new ParserException(value, "Long micros is greater than max instant"); - } - return instant; - } - - private static Instant parseAsEpochNanos(Parser longParser, Instant min, Instant max, String value) { - final long epochNano = longParser.parse(value); - final long epochSecond = Math.floorDiv(epochNano, 1_000_000_000); - final int nanoAdj = (int) Math.floorMod(epochNano, 1_000_000_000); - final Instant instant = Instant.ofEpochSecond(epochSecond, nanoAdj); - if (min != null && instant.isBefore(min)) { - throw new ParserException(value, "Long nanos is less than min instant"); - } - if (max != 
null && instant.isAfter(max)) { - throw new ParserException(value, "Long nanos is greater than max instant"); - } - return instant; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java b/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java new file mode 100644 index 00000000000..f4ce71302f4 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java @@ -0,0 +1,102 @@ +package io.deephaven.csv.containers; + +/** + * An object that represents a slice of byte data. This object is intended to be reusable. + */ +public final class ByteSlice { + /** + * The underlying data. + */ + private byte[] data; + /** + * The index of the first data element. + */ + private int begin; + /** + * The index that is one past the last data element. + */ + private int end; + + /** + * Make an empty ByteSlice with a null underlying array. + */ + public ByteSlice() {} + + /** + * Constructs a ByteSlice from the half-open interval [{@code begin}, {@code end}) of the array {@code data}. + */ + public ByteSlice(final byte[] data, final int begin, final int end) { + reset(data, begin, end); + } + + /** + * Reset the ByteSlice to the half-open interval [{@code begin}, {@code end}) of the array {@code data}. + */ + public void reset(final byte[] data, final int begin, final int end) { + this.data = data; + this.begin = begin; + this.end = end; + } + + /** + * Gets the 'begin' field of the slice + */ + public int begin() { + return begin; + } + + /** + * Gets the 'end' field of the slice. + */ + public int end() { + return end; + } + + /** + * Sets the 'begin' field of the slice. + */ + public void setBegin(int begin) { + this.begin = begin; + } + + /** + * Sets the 'end' field of the slice. + */ + public void setEnd(int end) { + this.end = end; + } + + /** + * Gets the first character of the slice. The behavior is unspecified if the slice is empty. + */ + public byte front() { + return data[begin]; + } + + /** + * Gets the last character of the slice. The behavior is unspecified if the slice is empty. + */ + public byte back() { + return data[end - 1]; + } + + /** + * Gets the underlying array from the slice. + */ + public byte[] data() { + return data; + } + + /** + * Gets the size of the slice. + */ + public int size() { + return end - begin; + } + + @Override + public String toString() { + final int size = end - begin; + return size == 0 ? "" : new String(data, begin, end - begin); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/containers/CharSlice.java b/extensions/csv/src/main/java/io/deephaven/csv/containers/CharSlice.java new file mode 100644 index 00000000000..0877d88ab59 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/containers/CharSlice.java @@ -0,0 +1,112 @@ +package io.deephaven.csv.containers; + +/** + * An object that represents a slice of char data. This object is intended to be reusable. + */ +public final class CharSlice { + /** + * The underlying data. + */ + private char[] data; + /** + * The index of the first data element. + */ + private int begin; + /** + * The index that is one past the last data element. + */ + private int end; + + /** + * Make an empty CharSlice with a null underlying array. + */ + public CharSlice() {} + + /** + * Reset the CharSlice to the half-open interval [{@code begin}, {@code end}) of the array {@code data}. 
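+     *
+     * <p>
+     * For example (hypothetical values): after {@code reset("hello".toCharArray(), 1, 4)}, {@code size()} is 3,
+     * {@code front()} is 'e', {@code back()} is 'l', and {@code toString()} returns "ell".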
+     */
+    public void reset(char[] data, int begin, int end) {
+        this.data = data;
+        this.begin = begin;
+        this.end = end;
+    }
+
+    /**
+     * Copies the slice to the destination array, starting at the specified destination position, converting chars to
+     * bytes as it goes. The behavior is unspecified if the CharSlice contains characters that cannot fit in a byte.
+     */
+    public void copyTo(byte[] dest, int destOffset) {
+        for (int cur = begin; cur != end; ++cur) {
+            dest[destOffset++] = (byte) data[cur];
+        }
+    }
+
+    /**
+     * Copies the slice to the destination array, starting at the specified destination position.
+     */
+    public void copyTo(char[] dest, int destOffset) {
+        System.arraycopy(data, begin, dest, destOffset, end - begin);
+    }
+
+    /**
+     * Gets the 'begin' field of the slice.
+     */
+    public int begin() {
+        return begin;
+    }
+
+    /**
+     * Gets the 'end' field of the slice.
+     */
+    public int end() {
+        return end;
+    }
+
+    /**
+     * Sets the 'begin' field of the slice.
+     */
+    public void setBegin(int begin) {
+        this.begin = begin;
+    }
+
+    /**
+     * Sets the 'end' field of the slice.
+     */
+    public void setEnd(int end) {
+        this.end = end;
+    }
+
+    /**
+     * Gets the first character of the slice. The behavior is unspecified if the slice is empty.
+     */
+    public char front() {
+        return data[begin];
+    }
+
+    /**
+     * Gets the last character of the slice. The behavior is unspecified if the slice is empty.
+     */
+    public char back() {
+        return data[end - 1];
+    }
+
+    /**
+     * Gets the underlying array from the slice.
+     */
+    public char[] data() {
+        return data;
+    }
+
+    /**
+     * Gets the size of the slice.
+     */
+    public int size() {
+        return end - begin;
+    }
+
+    @Override
+    public String toString() {
+        final int size = end - begin;
+        return size == 0 ? "" : new String(data, begin, end - begin);
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableCharBuffer.java b/extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableCharBuffer.java
new file mode 100644
index 00000000000..ed5d5276bef
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableCharBuffer.java
@@ -0,0 +1,72 @@
+package io.deephaven.csv.containers;
+
+/**
+ * This is like TCharArrayList except that you can get at the underlying data buffer and use it for your own purposes,
+ * assuming you know what you're doing. We exploit this ability to (temporarily) point our slices at the underlying
+ * array while we are processing slices. In terms of expected usage, this class is only used for holding the data for
+ * cells, and only when the cell has escaped characters (like escaped quotes) or the cell spans more than one input
+ * chunk (in which case we can no longer do the trick where we point a slice directly at the array that buffers our
+ * input). Therefore the max size of this data structure is equal to the size of the largest cell in the input (which
+ * is likely to be in the 10s or 100s of characters). Since it's expected to be modest in size, we don't worry too
+ * much about our growth strategy, which simply involves doubling when we run out of space. In practice for "normal"
+ * input, this object probably never reallocates.
+ */
+public final class GrowableCharBuffer {
+    private static final int INITIAL_BUFFER_SIZE = 1024;
+
+    /**
+     * Underlying buffer. Grows as needed.
+     */
+    private char[] data = new char[INITIAL_BUFFER_SIZE];
+    /**
+     * Current size of the data.
+     */
+    private int size = 0;
+
+    /**
+     * Appends 'srcSize' characters from 'src', starting at 'srcOffset'.
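+     *
+     * <p>
+     * A hypothetical usage sketch:
+     *
+     * <pre>
+     * final GrowableCharBuffer buf = new GrowableCharBuffer();
+     * buf.append(new char[] {'a', 'b', 'c'}, 0, 3);
+     * final String contents = new String(buf.data(), 0, buf.size()); // "abc"
+     * </pre>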
+     */
+    public void append(char[] src, int srcOffset, int srcSize) {
+        ensure(srcSize);
+        System.arraycopy(src, srcOffset, data, size, srcSize);
+        size += srcSize;
+    }
+
+    /**
+     * Ensure that the buffer can hold at least 'additionalSize' additional items.
+     */
+    private void ensure(int additionalSize) {
+        final int sizeNeeded = Math.addExact(size, additionalSize);
+        if (sizeNeeded <= data.length) {
+            return;
+        }
+
+        // Ensure that we always at least double the buffer, though we may not always
+        // follow powers of two.
+        final int newSize = Math.max(sizeNeeded, Math.multiplyExact(size, 2));
+        final char[] newData = new char[newSize];
+        System.arraycopy(data, 0, newData, 0, size);
+        data = newData;
+    }
+
+    /**
+     * Clear the buffer.
+     */
+    public void clear() {
+        size = 0;
+    }
+
+    /**
+     * Access the underlying data array.
+     */
+    public char[] data() {
+        return data;
+    }
+
+    /**
+     * The current size.
+     */
+    public int size() {
+        return size;
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java
new file mode 100644
index 00000000000..4136384e990
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java
@@ -0,0 +1,39 @@
+package io.deephaven.csv.densestorage;
+
+/**
+ * Constants that control the behavior of the {@link DenseStorageWriter} and {@link DenseStorageReader}.
+ */
+public class DenseStorageConstants {
+    /**
+     * When input strings are less than this threshold, we pack them tightly into a chunk. When they are greater than
+     * or equal to this threshold, we allocate them directly as their own individual byte or char arrays as
+     * appropriate.
+     */
+    public static final int LARGE_THRESHOLD = 1024;
+    /**
+     * Size of the "control queue" blocks. Somewhat arbitrary, but should be large-ish. We have arbitrarily chosen
+     * 100,000 here.
+     */
+    public static final int CONTROL_QUEUE_SIZE = 100_000;
+    /**
+     * Size of the "packed queue" char and byte blocks. The number chosen is somewhat arbitrary, but it should be
+     * large-ish (100K? 1M?) for performance and a decent multiple of LARGE_THRESHOLD to avoid wasting too much space
+     * at the end of each block. By making it 1024x the size of LARGE_THRESHOLD, we can show that the fraction of
+     * wasted space at the end of each block is not more than (1/1024).
+     */
+    public static final int PACKED_QUEUE_SIZE = LARGE_THRESHOLD * 1024;
+    /**
+     * Size of the "array queue". Somewhat arbitrary, but should be large-ish. We have arbitrarily chosen 100K here.
+     * 10K might also be reasonable.
+     */
+    public static final int ARRAY_QUEUE_SIZE = 100_000;
+    /**
+     * This sentinel value is used to indicate that the next value being read is not a packed (char or byte) string
+     * but is instead its own byte array.
+     */
+    public static final int LARGE_BYTE_ARRAY_SENTINEL = Integer.MAX_VALUE;
+    /**
+     * This sentinel value is used to indicate that the next value being read is not a packed (char or byte) string
+     * but is instead its own char array.
+     */
+    public static final int LARGE_CHAR_ARRAY_SENTINEL = Integer.MIN_VALUE;
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java
new file mode 100644
index 00000000000..89eb0ad64c9
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java
@@ -0,0 +1,102 @@
+
+package io.deephaven.csv.densestorage;
+
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.containers.CharSlice;
+import io.deephaven.csv.util.CsvReaderException;
+import org.apache.commons.lang3.mutable.MutableInt;
+
+/**
+ * Companion to the {@link DenseStorageWriter}. See the documentation there for details.
+ */
+public final class DenseStorageReader {
+    /**
+     * For the return type of {@link #tryGetNextSlice}. Indicates whether the populated slice was bytes, chars, or
+     * none (if there is no next slice).
+     */
+    public enum SliceType {
+        None, Bytes, Chars
+    }
+
+    /**
+     * Byte sequences < LARGE_THRESHOLD are compactly stored here
+     */
+    private final QueueReader.ByteReader byteReader;
+    /**
+     * Char sequences < LARGE_THRESHOLD are compactly stored here
+     */
+    private final QueueReader.CharReader charReader;
+    /**
+     * Byte sequences >= LARGE_THRESHOLD are stored here
+     */
+    private final QueueReader.ByteArrayReader largeByteArrayReader;
+    /**
+     * Char sequences >= LARGE_THRESHOLD are stored here
+     */
+    private final QueueReader.CharArrayReader largeCharArrayReader;
+    /**
+     * Control ints (lengths, negated lengths, or sentinels). See DenseStorageWriter.
+     */
+    private final QueueReader.IntReader controlReader;
+    /**
+     * For the "out" parameter of controlReader.tryGetInt()
+     */
+    private final MutableInt intHolder = new MutableInt();
+
+    /**
+     * Constructor.
+     */
+    public DenseStorageReader(final QueueReader.IntReader controlReader,
+            final QueueReader.ByteReader byteReader,
+            final QueueReader.CharReader charReader,
+            final QueueReader.ByteArrayReader largeByteArrayReader,
+            final QueueReader.CharArrayReader largeCharArrayReader) {
+        this.controlReader = controlReader;
+        this.byteReader = byteReader;
+        this.charReader = charReader;
+        this.largeByteArrayReader = largeByteArrayReader;
+        this.largeCharArrayReader = largeCharArrayReader;
+    }
+
+    /**
+     * Get the next slice from one of the inner QueueReaders. Uses data in the 'controlReader' to figure out which
+     * QueueReader the next slice is coming from.
+     *
+     * @param bs If the method returns SliceType.Bytes, the contents of this parameter will be updated.
+     * @param cs If the method returns SliceType.Chars, the contents of this parameter will be updated.
+     * @return {@link SliceType#Bytes} or {@link SliceType#Chars} if the operation succeeded. Otherwise,
+     *         {@link SliceType#None}.
+     * @throws CsvReaderException if the underlying queues are in an inconsistent state (e.g. data unexpectedly
+     *         exhausted).
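+     *
+     * <p>
+     * A hypothetical read loop (the {@code reader} variable is illustrative):
+     *
+     * <pre>
+     * final ByteSlice bs = new ByteSlice();
+     * final CharSlice cs = new CharSlice();
+     * DenseStorageReader.SliceType st;
+     * while ((st = reader.tryGetNextSlice(bs, cs)) != DenseStorageReader.SliceType.None) {
+     *     // consume bs if st == SliceType.Bytes, or cs if st == SliceType.Chars
+     * }
+     * </pre>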
+     */
+    public SliceType tryGetNextSlice(final ByteSlice bs, final CharSlice cs)
+            throws CsvReaderException {
+        if (!controlReader.tryGetInt(intHolder)) {
+            return SliceType.None;
+        }
+        final int control = intHolder.intValue();
+        if (control == DenseStorageConstants.LARGE_BYTE_ARRAY_SENTINEL) {
+            mustSucceed(largeByteArrayReader.tryGetBytes(bs), "largeByteArrayReader");
+            return SliceType.Bytes;
+        }
+        if (control == DenseStorageConstants.LARGE_CHAR_ARRAY_SENTINEL) {
+            mustSucceed(largeCharArrayReader.tryGetChars(cs), "largeCharArrayReader");
+            return SliceType.Chars;
+        }
+        if (control >= 0) {
+            mustSucceed(byteReader.tryGetBytes(control, bs), "byteReader");
+            return SliceType.Bytes;
+        }
+        mustSucceed(charReader.tryGetChars(-control, cs), "charReader");
+        return SliceType.Chars;
+    }
+
+    /**
+     * Convenience method that throws an exception if "success" is false.
+     */
+    private static void mustSucceed(final boolean success, final String what) throws CsvReaderException {
+        if (!success) {
+            throw new CsvReaderException("Data unexpectedly exhausted: " + what);
+        }
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java
new file mode 100644
index 00000000000..eb432fa8686
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java
@@ -0,0 +1,182 @@
+package io.deephaven.csv.densestorage;
+
+import io.deephaven.csv.containers.CharSlice;
+import io.deephaven.csv.tokenization.RangeTests;
+
+/**
+ * The DenseStorageWriter and {@link DenseStorageReader} work in tandem, forming a FIFO queue. The DenseStorageWriter
+ * writes data, and the {@link DenseStorageReader} reads that data. If the {@link DenseStorageReader} "catches up", it
+ * will block until the DenseStorageWriter provides more data, or indicates that it is done (via the {@link #finish()}
+ * method). This synchronization is done at "block" granularity, so the DenseStorageReader can only proceed when the
+ * DenseStorageWriter has written at least a "block" of data or is done. We allow multiple independent
+ * {@link DenseStorageReader}s to consume the same underlying data. In our implementation this is used so our type
+ * inferencer can take a second "pass" over the same input data.
+ *
+ * <p>
+ * The point of this object is to store a sequence of (character sequences aka "strings", but not java.lang.String),
+ * using a small fraction of overhead. The problem with storing every character sequence as a java.lang.String is:
+ * <ol>
+ * <li>Per-object overhead (probably 8 or 16 bytes depending on pointer width)</li>
+ * <li>The memory cost of holding a reference to that String (again 4 or 8 bytes)</li>
+ * <li>The string has to know its length (4 bytes)</li>
+ * <li>Java characters are 2 bytes even though in practice many strings are ASCII-only and their chars can fit in a
+ * byte. (Newer Java implementations can store text as bytes, eliminating this objection)</li>
+ * </ol>
+ *
+ * <p>
+ * For small strings (say the word "hello" or the input text "12345.6789") the overhead can be 100% or worse.
+ *
+ * <p>
+ * For our purposes we:
+ * <ol>
+ * <li>Only need sequential access, i.e. we don't need random access into the sequence of "strings". So we can support
+ * a model where we can have a forward-only cursor moving over the sequence of "strings".</li>
+ * <li>Don't need to give our caller a data structure that they can hold on to. The caller only gets a "view" (a
+ * slice) of the current "string" data. The view is invalidated when they move to the next "string".</li>
+ * </ol>
+ *
+ * <p>
+ * Furthermore we:
+ * <ol>
+ * <li>Offer a FIFO model where the reader (in a separate thread) can chase the writer but there is not an inordinate
+ * amount of synchronization overhead (synchronization happens at the block level, not the "string" level).</li>
+ * <li>Have the ability to make multiple Readers which pass over the same underlying data. This is our low-drama way
+ * of allowing our client to make multiple passes over the data, without complicating the iteration interface with,
+ * e.g., a reset method.</li>
+ * <li>Use a linked-list structure so that when all existing readers have moved past a block of data, that block can
+ * be freed by the garbage collector without any explicit action taken by the reader.</li>
+ * </ol>
+ *
+ * <p>
+ * If you are familiar with the structure of our inference, you may initially think that this reader-chasing-writer
+ * garbage collection trick doesn't buy us much because we have a two-phase parser. However, when the inferencer has
+ * gotten to the last parser in its set of allowable parsers (say, the String parser), or the user has specified that
+ * there is only one parser for this column, then the code doesn't need to do any inference and can parse the column
+ * in one pass. In this case, when the reader stays caught up with the writer, we are basically just buffering one
+ * block of data, not the whole file.
+ *
+ * <p>
+ * The implementation used here is to look at the "string" being added to the writer and categorize it along two
+ * dimensions:
+ * <ul>
+ * <li>Small vs large</li>
+ * <li>Byte vs char</li>
+ * </ul>
+ *
+ * <p>
+ * These dimensions are broken out in the following way:
+ * <ul>
+ * <li>Small byte "strings" are packed into a byte block, and we maintain a linked list of these byte blocks.</li>
+ * <li>Small char "strings" are packed into a char block, and we maintain a linked list of these char blocks.</li>
+ * <li>"Large" objects (byte or char sequences with length >= a threshold) are stored directly, meaning a byte[] or
+ * char[] array is allocated for their data, then a reference to that array is added to a byte-array or char-array
+ * block. (And again, we maintain a linked list of these byte-array or char-array blocks). It is not typical for CSV
+ * data to contain a cell this large, but the feature is there for completeness. We do not want large "strings" to
+ * contaminate our byte and char blocks because they would not likely pack into them tightly (it would become more
+ * likely to have allocated blocks with unused storage at the end, because the last big string wouldn't fit in the
+ * current block). It's OK to keep them on their own because, by definition, large "strings" are not going to have
+ * much overhead as a percentage of the size of their text content.</li>
+ * </ul>
+ */
+public final class DenseStorageWriter {
+    /**
+     * The ints in this queue indicate where the next item is stored:
+     * <ul>
+     * <li>Integer.MIN_VALUE: largeCharArrayWriter.</li>
+     * <li>Integer.MAX_VALUE: largeByteArrayWriter.</li>
+     * <li>== 0: no bytes or characters, so they're not stored anywhere. Will be interpreted as a ByteSlice with
+     * arbitrary byte data and length 0.</li>
+     * <li>< 0: charWriter (the number of chars is the negative of this value)</li>
+     * <li>> 0: byteWriter (the number of bytes is equal to this value)</li>
+     * </ul>
+     */
+    private final QueueWriter.IntWriter controlWriter;
+    /**
+     * Byte sequences < LARGE_THRESHOLD are compactly stored here
+     */
+    private final QueueWriter.ByteWriter byteWriter;
+    /**
+     * Char sequences < LARGE_THRESHOLD are compactly stored here
+     */
+    private final QueueWriter.CharWriter charWriter;
+    /**
+     * Byte sequences >= LARGE_THRESHOLD are stored here
+     */
+    private final QueueWriter.ByteArrayWriter largeByteArrayWriter;
+    /**
+     * Char sequences >= LARGE_THRESHOLD are stored here
+     */
+    private final QueueWriter.CharArrayWriter largeCharArrayWriter;
+
+    /**
+     * Constructor.
+     */
+    public DenseStorageWriter() {
+        this.controlWriter = new QueueWriter.IntWriter(DenseStorageConstants.CONTROL_QUEUE_SIZE);
+        this.byteWriter = new QueueWriter.ByteWriter(DenseStorageConstants.PACKED_QUEUE_SIZE);
+        this.charWriter = new QueueWriter.CharWriter(DenseStorageConstants.PACKED_QUEUE_SIZE);
+        this.largeByteArrayWriter = new QueueWriter.ByteArrayWriter(DenseStorageConstants.ARRAY_QUEUE_SIZE);
+        this.largeCharArrayWriter = new QueueWriter.CharArrayWriter(DenseStorageConstants.ARRAY_QUEUE_SIZE);
+    }
+
+    public DenseStorageReader newReader() {
+        return new DenseStorageReader(
+                controlWriter.newReader(),
+                byteWriter.newReader(),
+                charWriter.newReader(),
+                largeByteArrayWriter.newReader(),
+                largeCharArrayWriter.newReader());
+    }
+
+    /**
+     * Append a CharSlice to the queue. The data will be diverted to one of the four specialized underlying queues,
+     * depending on its size and byte vs char nature.
+     */
+    public void append(final CharSlice cs) {
+        final boolean fctrl;
+        final int size = cs.size();
+        final boolean isByteRepresentable = RangeTests.isByteRepresentable(cs.data(), cs.begin(), cs.end());
+        if (size >= DenseStorageConstants.LARGE_THRESHOLD) {
+            if (isByteRepresentable) {
+                final byte[] data = new byte[size];
+                cs.copyTo(data, 0);
+                largeByteArrayWriter.addByteArray(data);
+                fctrl = controlWriter.addInt(DenseStorageConstants.LARGE_BYTE_ARRAY_SENTINEL);
+            } else {
+                final char[] data = new char[size];
+                cs.copyTo(data, 0);
+                largeCharArrayWriter.addCharArray(data);
+                fctrl = controlWriter.addInt(DenseStorageConstants.LARGE_CHAR_ARRAY_SENTINEL);
+            }
+        } else {
+            // size < DenseStorageConstants.LARGE_THRESHOLD
+            if (isByteRepresentable) {
+                byteWriter.addBytesFromCharSlice(cs);
+                fctrl = controlWriter.addInt(size);
+            } else {
+                charWriter.addChars(cs);
+                fctrl = controlWriter.addInt(-size);
+            }
+        }
+        // If the control queue flushed, then flush all the data queues, so the reader doesn't block for a long time
+        // waiting for some unflushed data queue. One might worry that this is inefficient, but (a) it doesn't happen
+        // very often and (b) in our queue code, partially-filled blocks can share non-overlapping parts of their
+        // large underlying data array, so it's not too wasteful. Put another way: flushing an empty queue does
+        // nothing; flushing a partially-filled queue allocates a new QueueNode but not a new underlying data array;
+        // flushing a full queue will allocate a new QueueNode and (at the next write) a new underlying data array.
+        if (fctrl) {
+            byteWriter.flush();
+            charWriter.flush();
+            largeByteArrayWriter.flush();
+            largeCharArrayWriter.flush();
+        }
+    }
+
+    /**
+     * Call this method to indicate when you are finished writing to the queue.
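+     *
+     * <p>
+     * A hypothetical end-to-end sketch (the {@code cellSlice} variable is illustrative):
+     *
+     * <pre>
+     * final DenseStorageWriter dsw = new DenseStorageWriter();
+     * final DenseStorageReader dsr = dsw.newReader(); // readers must be created before writing starts
+     * dsw.append(cellSlice); // called once per input cell
+     * dsw.finish(); // readers can then drain any remaining blocks and observe end of input
+     * </pre>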
+     */
+    public void finish() {
+        controlWriter.finish();
+        byteWriter.finish();
+        charWriter.finish();
+        largeByteArrayWriter.finish();
+        largeCharArrayWriter.finish();
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java
new file mode 100644
index 00000000000..def822d5d83
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java
@@ -0,0 +1,28 @@
+package io.deephaven.csv.densestorage;
+
+/**
+ * Linked list node that holds data for a {@link DenseStorageWriter} or {@link DenseStorageReader}. All fields are
+ * immutable except the "next" field. Synchronization for reading/writing the "next" field is managed by the
+ * {@link DenseStorageWriter} and {@link DenseStorageReader}.
+ */
+public final class QueueNode<TARRAY> {
+    public final TARRAY data;
+    public final int begin;
+    public final int end;
+    public final boolean isLast;
+    /**
+     * Readers and writers of this field have arranged to synchronize with each other.
+     */
+    public QueueNode<TARRAY> next;
+
+    /**
+     * Constructor. Sets this queue node to represent the half-open interval ['begin','end') of the array 'data'.
+     */
+    public QueueNode(TARRAY data, int begin, int end, boolean isLast) {
+        this.data = data;
+        this.begin = begin;
+        this.end = end;
+        this.isLast = isLast;
+        this.next = null;
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java
new file mode 100644
index 00000000000..6e678ef27c0
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java
@@ -0,0 +1,288 @@
+package io.deephaven.csv.densestorage;
+
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.containers.CharSlice;
+import org.apache.commons.lang3.mutable.MutableInt;
+
+/**
+ * Companion to the {@link QueueWriter}. See the documentation there for details.
+ */
+public class QueueReader<TARRAY> {
+    /**
+     * Sync object which synchronizes access to the "next" fields of every node in our linked list. Shared with the
+     * QueueWriter.
+     */
+    private final Object sync;
+    /**
+     * Current node.
+     */
+    private QueueNode<TARRAY> node;
+    /**
+     * Current block we are reading from, extracted from the current node.
+     */
+    protected TARRAY genericBlock;
+    /**
+     * Current offset in the current block. Updated as we read data. When the value reaches "end", then data in this
+     * block is exhausted.
+     */
+    protected int current;
+    /**
+     * "end" offset of the current block.
+     */
+    protected int end;
+
+    /**
+     * Constructor.
+     */
+    protected QueueReader(Object sync, QueueNode<TARRAY> node) {
+        this.sync = sync;
+        this.node = node;
+        this.genericBlock = null;
+        this.current = 0;
+        this.end = 0;
+    }
+
+    /**
+     * This method exists as a helper method for a subclass' tryGetXXX method. A typical implementation is in
+     * CharReader:
+     *
+     * <pre>
    +     * if (current + size > end) {
    +     *     if (!tryRefill(size)) {
    +     *         return false;
    +     *     }
    +     *     typedBlock = genericBlock;
    +     * }
    +     * 
+     *
+     * The "if" in the caller is actually checking for two cases in a single comparison. One is a normal "buffer
+     * empty, needs to be refilled" case. The other is a bad "something went terribly wrong" case.
+     *
+     * Case 1: The buffer is empty. Then current == end, and therefore current + size > end (given size > 0, which it
+     * always is), and the 'if' evaluates to true, so the tryGetXXX code would call us. Then we would refill the
+     * buffer and proceed.
+     *
+     * Case 2: The buffer isn't empty, but size goes beyond the end of the block. Then current < end but
+     * current + size > end. Then the 'if' evaluates to true, so the tryGetXXX code would call us. But then the first
+     * line of our method would detect the past-the-end condition and throw an exception.
+     *
+     * Case 3: The buffer isn't empty, and size does not go beyond the end of the block. Then the 'if' evaluates to
+     * false (in other words there's enough data left for the caller), so the tryGetXXX method doesn't need to call
+     * us.
+     */
+    protected boolean tryRefill(int size) {
+        if (current != end) {
+            throw new RuntimeException("Logic error: slice straddled block");
+        }
+        while (current == end) {
+            if (node.isLast) {
+                // Hygiene.
+                node = null;
+                genericBlock = null;
+                current = 0;
+                end = 0;
+                return false;
+            }
+            synchronized (sync) {
+                while (node.next == null) {
+                    catchyWait(sync);
+                }
+                node = node.next;
+                genericBlock = node.data;
+                current = node.begin;
+                end = node.end;
+            }
+        }
+        if (end - current < size) {
+            throw new RuntimeException(String.format("Logic error: got short block: expected at least %d, got %d",
+                    size, end - current));
+        }
+        return true;
+    }
+
+    /**
+     * Call Object.wait() but suppress the need to deal with checked InterruptedExceptions.
+     */
+    private static void catchyWait(Object o) {
+        try {
+            o.wait();
+        } catch (InterruptedException ie) {
+            throw new RuntimeException("Logic error: thread interrupted: can't happen");
+        }
+    }
+
+    /**
+     * A QueueReader specialized for bytes.
+     */
+    public static final class ByteReader extends QueueReader<byte[]> {
+        /**
+         * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is
+         * a performance optimization that may not matter.
+         */
+        private byte[] typedBlock;
+
+        /**
+         * Constructor.
+         */
+        public ByteReader(final Object sync, final QueueNode<byte[]> head) {
+            super(sync, head);
+        }
+
+        /**
+         * Tries to get the next ByteSlice from the reader.
+         *
+         * @param size The exact number of bytes to place in the slice.
+         * @param bs The result, modified in place.
+         * @return true if the next ByteSlice was successfully read; false if the end of input was reached.
+         */
+        public boolean tryGetBytes(final int size, final ByteSlice bs) {
+            if (current + size > end) {
+                if (!tryRefill(size)) {
+                    return false;
+                }
+                typedBlock = genericBlock;
+            }
+            bs.reset(typedBlock, current, current + size);
+            current += size;
+            return true;
+        }
+    }
+
+    /**
+     * A QueueReader specialized for chars.
+     */
+    public static final class CharReader extends QueueReader<char[]> {
+        /**
+         * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is
+         * a performance optimization that may not matter.
+         */
+        private char[] typedBlock;
+
+        /**
+         * Constructor.
+         */
+        public CharReader(final Object sync, final QueueNode<char[]> head) {
+            super(sync, head);
+        }
+
+        /**
+         * Tries to get the next CharSlice from the reader.
+         *
+         * @param size The exact number of chars to place in the slice.
+         * @param cs The result, modified in place.
+         * @return true if the next CharSlice was successfully read; false if the end of input was reached.
+         */
+        public boolean tryGetChars(final int size, final CharSlice cs) {
+            if (current + size > end) {
+                if (!tryRefill(size)) {
+                    return false;
+                }
+                typedBlock = genericBlock;
+            }
+            cs.reset(typedBlock, current, current + size);
+            current += size;
+            return true;
+        }
+    }
+
+    /**
+     * A QueueReader specialized for ints.
+     */
+    public static final class IntReader extends QueueReader<int[]> {
+        /**
+         * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is
+         * a performance optimization that may not matter.
+         */
+        private int[] typedBlock;
+
+        /**
+         * Constructor.
+         */
+        public IntReader(Object sync, QueueNode<int[]> head) {
+            super(sync, head);
+        }
+
+        /**
+         * Tries to get the next integer from the reader.
+         *
+         * @param result If the operation succeeds, contains the next integer. Otherwise, the contents are
+         *        unspecified.
+         * @return true if the next value was successfully read; false if the end of input was reached.
+         */
+        public boolean tryGetInt(final MutableInt result) {
+            if (current == end) {
+                if (!tryRefill(1)) {
+                    return false;
+                }
+                typedBlock = genericBlock;
+            }
+            result.setValue(typedBlock[current++]);
+            return true;
+        }
+    }
+
+    /**
+     * A QueueReader specialized for byte arrays.
+     */
+    public static final class ByteArrayReader extends QueueReader<byte[][]> {
+        /**
+         * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is
+         * a performance optimization that may not matter.
+         */
+        private byte[][] typedBlock;
+
+        public ByteArrayReader(final Object sync, final QueueNode<byte[][]> head) {
+            super(sync, head);
+        }
+
+        /**
+         * Tries to get the next ByteSlice from the reader.
+         *
+         * @param bs The result, modified in place.
+         * @return true if the next ByteSlice was successfully read; false if the end of input was reached.
+         */
+        public boolean tryGetBytes(final ByteSlice bs) {
+            if (current == end) {
+                if (!tryRefill(1)) {
+                    return false;
+                }
+                typedBlock = genericBlock;
+            }
+            final byte[] data = typedBlock[current++];
+            bs.reset(data, 0, data.length);
+            return true;
+        }
+    }
+
+    /**
+     * A QueueReader specialized for char arrays.
+     */
+    public static final class CharArrayReader extends QueueReader<char[][]> {
+        /**
+         * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is
+         * a performance optimization that may not matter.
+         */
+        private char[][] typedBlock;
+
+        public CharArrayReader(Object sync, QueueNode<char[][]> head) {
+            super(sync, head);
+        }
+
+        /**
+         * Tries to get the next CharSlice from the reader.
+         *
+         * @param cs The result, modified in place.
+         * @return true if the next CharSlice was successfully read; false if the end of input was reached.
+         */
+        public boolean tryGetChars(CharSlice cs) {
+            if (current == end) {
+                if (!tryRefill(1)) {
+                    return false;
+                }
+                typedBlock = genericBlock;
+            }
+            final char[] data = typedBlock[current++];
+            cs.reset(data, 0, data.length);
+            return true;
+        }
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java
new file mode 100644
index 00000000000..baa0af34bb5
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java
@@ -0,0 +1,308 @@
+package io.deephaven.csv.densestorage;
+
+import io.deephaven.csv.containers.CharSlice;
+
+import java.util.function.BiFunction;
+import java.util.function.IntFunction;
+
+/**
+ * The various QueueWriters ({@link ByteWriter}, {@link CharWriter}, etc.) work in tandem with their corresponding
+ * {@link QueueReader}s ({@link QueueReader.ByteReader}, {@link QueueReader.CharReader}, etc), forming a FIFO queue.
+ * The QueueWriter writes data, and the {@link QueueReader} reads that data. If the {@link QueueReader} "catches up",
+ * it will block until the QueueWriter provides more data, or indicates that it is done (via the {@link #finish()}
+ * method). This synchronization is done at "block" granularity, so the {@link QueueReader} can only proceed when the
+ * QueueWriter has written at least a "block" of data or is done. We allow multiple independent {@link QueueReader}s
+ * to consume the same underlying data. In our implementation this is used so our type inferencer can take a second
+ * "pass" over the same input data.
+ *
+ * In our implementation the {@link DenseStorageWriter} and {@link DenseStorageReader} are built out of various
+ * QueueWriters and {@link QueueReader}s. This explains why the semantics of {@link DenseStorageWriter} and
+ * {@link DenseStorageReader} are similar to those of the underlying QueueWriters and {@link QueueReader}s.
+ */
+public class QueueWriter<TARRAY, TREADER extends QueueReader<TARRAY>> {
+    /**
+     * Sync object which synchronizes access to the "next" fields of every node in our linked list. Shared with the
+     * QueueReader.
+     */
+    private final Object sync;
+    /**
+     * Tail of the linked list. We append here when we flush.
+     */
+    private QueueNode<TARRAY> tail;
+    /**
+     * Size of the chunks we allocate that we pack data into.
+     */
+    protected final int blockSize;
+    /**
+     * Lambda for allocating arrays for our chunks.
+     */
+    private final IntFunction<TARRAY> arrayFactory;
+    /**
+     * Lambda to make a QueueReader of the right subtype.
+     */
+    private final BiFunction<Object, QueueNode<TARRAY>, TREADER> readerFactory;
+    /**
+     * A flag that says whether it's still early enough to allow QueueReader creation. After the writer starts
+     * writing, they shouldn't be allowed to create any readers. (This is just because we want to keep the semantics
+     * simple).
+     */
+    private boolean allowReaderCreation;
+    /**
+     * Current block we are writing to. When we flush, we will write it to a new linked list node.
+     */
+    private TARRAY genericBlock;
+    /**
+     * Start of the current block. This is typically 0, but not always. If the caller does an early flush (before the
+     * block is filled), you can have multiple linked list nodes sharing different segments of the same underlying
+     * block storage.
+     */
+    protected int begin;
+    /**
+     * Current offset in the current block. Updated as we write data. When the value reaches "end", then data in this
+     * block is exhausted.
+     */
+    protected int current;
+    /**
+     * End of the current block. The same as genericBlock.length.
+     */
+    protected int end;
+
+    /**
+     * Constructor.
+     */
+    protected QueueWriter(final int blockSize,
+            final IntFunction<TARRAY> arrayFactory,
+            final BiFunction<Object, QueueNode<TARRAY>, TREADER> readerFactory) {
+        this.sync = new Object();
+        // Creating the linked list with a sentinel object makes linked list manipulation code simpler.
+        this.tail = new QueueNode<>(null, 0, 0, false);
+        this.blockSize = blockSize;
+        this.arrayFactory = arrayFactory;
+        this.readerFactory = readerFactory;
+        this.allowReaderCreation = true;
+        this.genericBlock = null;
+        this.begin = 0;
+        this.current = 0;
+        this.end = 0;
+    }
+
+    /**
+     * Caller is finished writing.
+     */
+    public void finish() {
+        flush(true);
+        genericBlock = null; // hygiene
+        begin = 0;
+        current = 0;
+        end = 0;
+    }
+
+    /**
+     * Make a {@link QueueReader} corresponding to this QueueWriter. You can make as many {@link QueueReader}s as you
+     * want, but you should make them before you start writing data.
+     */
+    public TREADER newReader() {
+        if (!allowReaderCreation) {
+            throw new RuntimeException("Logic error: must allocate readers before writing any data");
+        }
+        return readerFactory.apply(sync, tail);
+    }
+
+    /**
+     * This supports an "early flush" for callers like {@link DenseStorageWriter} who want to flush all their queues
+     * from time to time.
+     */
+    public void flush() {
+        flush(false);
+    }
+
+    /**
+     * Flush can be called at any time... when the block is empty (and hence there is nothing to flush), when there's
+     * some data, or when the block is full.
+     *
+     * @param isLast Whether this is the last node in the linked list.
+     */
+    private void flush(boolean isLast) {
+        // Sometimes our users ask us to flush even if there is nothing to flush.
+        // If the block is an "isLast" block, we need to flush it regardless of whether it contains data.
+        // Otherwise (if the block is not an "isLast" block), we only flush it if it contains data.
+        if (!isLast && (current == begin)) {
+            // No need to flush.
+            return;
+        }
+
+        // No more creating readers after the first flush.
+        allowReaderCreation = false;
+
+        final QueueNode<TARRAY> newBlob = new QueueNode<>(genericBlock, begin, current, isLast);
+        // If this is an early flush (before the block was filled), the next node may share
+        // the same underlying storage array (but disjoint segments of that array) as the current node.
+        // To accomplish this, we just advance "begin" to "current" here. At this point in the logic
+        // we don't care if that leaves the block with zero capacity (begin == end) or not. The decision
+        // to actually start a new block is made by the addXXX code in our subclasses, which eventually
+        // calls flushAndAllocate.
+        begin = current;
+        synchronized (sync) {
+            tail.next = newBlob;
+            tail = newBlob;
+            sync.notifyAll();
+        }
+    }
+
+    /**
+     * This method exists as a helper method for a subclass' addXXX method. A typical implementation is in CharWriter:
+     *
+     * <pre>
    +     * final int sliceSize = cs.size();
    +     * final boolean flushHappened = current + sliceSize > end;
    +     * if (flushHappened) {
    +     *   typedBlock = flushAndAllocate(sliceSize);
    +     * }
    +     * ...
    +     * 
+     * </pre>
+     *
+     * The "flushHappened" variable (which at the point of its definition would be more precisely interpreted as
+     * "flush is about to happen") calculates whether the data that currently needs to be written can fit in the
+     * current block or not. If it can fit, the code continues on to write its data. If it can't fit, the subclass
+     * calls this flushAndAllocate method to flush the current block to the linked list and allocate a new one. The
+     * newly allocated block is guaranteed to be of size at least 'sizeNeeded'.
+     */
+    protected final TARRAY flushAndAllocate(int sizeNeeded) {
+        flush(false);
+        final int capacity = Math.max(blockSize, sizeNeeded);
+        genericBlock = arrayFactory.apply(capacity);
+        begin = 0;
+        current = 0;
+        end = capacity;
+        return genericBlock;
+    }
+
+    /**
+     * A QueueWriter specialized for bytes.
+     */
+    public static final class ByteWriter extends QueueWriter<byte[], QueueReader.ByteReader> {
+        private byte[] typedBlock = null;

+        public ByteWriter(final int blockSize) {
+            super(blockSize, byte[]::new, QueueReader.ByteReader::new);
+        }
+
+        /**
+         * Add bytes from a CharSlice to the queue. The conversion from char to byte is provided automatically as a
+         * convenience. The caller needs to ensure that the characters in the CharSlice are within the range of a
+         * byte.
+         *
+         * @return true if the add caused a flush to happen prior to the write, false if no flush happened.
+         */
+        public boolean addBytesFromCharSlice(CharSlice cs) {
+            final int sliceSize = cs.size();
+            final boolean flushHappened = current + sliceSize > end;
+            if (flushHappened) {
+                typedBlock = flushAndAllocate(sliceSize);
+            }
+            cs.copyTo(typedBlock, current);
+            current += sliceSize;
+            return flushHappened;
+        }
+    }
+
+    /**
+     * A QueueWriter specialized for chars.
+     */
+    public static final class CharWriter extends QueueWriter<char[], QueueReader.CharReader> {
+        private char[] typedBlock = null;
+
+        public CharWriter(final int blockSize) {
+            super(blockSize, char[]::new, QueueReader.CharReader::new);
+        }
+
+        /**
+         * Add chars from a CharSlice to the queue.
+         *
+         * @return true if the add caused a flush to happen prior to the write, false if no flush happened.
+         */
+        public boolean addChars(CharSlice cs) {
+            final int sliceSize = cs.size();
+            final boolean flushHappened = current + sliceSize > end;
+            if (flushHappened) {
+                typedBlock = flushAndAllocate(sliceSize);
+            }
+            cs.copyTo(typedBlock, current);
+            current += sliceSize;
+            return flushHappened;
+        }
+    }
+
+    /**
+     * A QueueWriter specialized for ints.
+     */
+    public static final class IntWriter extends QueueWriter<int[], QueueReader.IntReader> {
+        private int[] typedBlock = null;
+
+        public IntWriter(final int blockSize) {
+            super(blockSize, int[]::new, QueueReader.IntReader::new);
+        }
+
+        /**
+         * Add an int to the queue.
+         *
+         * @return true if the add caused a flush to happen prior to the write, false if no flush happened.
+         */
+        public boolean addInt(int value) {
+            final boolean flushHappened = current == end;
+            if (flushHappened) {
+                typedBlock = flushAndAllocate(1);
+            }
+            typedBlock[current++] = value;
+            return flushHappened;
+        }
+    }
+
+    /**
+     * A QueueWriter specialized for byte arrays.
+     */
+    public static final class ByteArrayWriter extends QueueWriter<byte[][], QueueReader.ByteArrayReader> {
+        private byte[][] block = null;
+
+        public ByteArrayWriter(int blobSize) {
+            super(blobSize, byte[][]::new, QueueReader.ByteArrayReader::new);
+        }
+
+        /**
+         * Add a byte array to the queue.
+         *
+         * @return true if the add caused a flush to happen prior to the write, false if no flush happened.
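+         *
+         * <p>
+         * As with the other add methods, the return value lets a caller coordinate several queues; for example,
+         * DenseStorageWriter flushes its data queues whenever an add to its control queue reports a flush.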
+         */
+        public boolean addByteArray(byte[] value) {
+            final boolean flushHappened = current == end;
+            if (flushHappened) {
+                block = flushAndAllocate(1);
+            }
+            block[current++] = value;
+            return flushHappened;
+        }
+    }
+
+    /**
+     * A QueueWriter specialized for char arrays.
+     */
+    public static final class CharArrayWriter extends QueueWriter<char[][], QueueReader.CharArrayReader> {
+        private char[][] block = null;
+
+        public CharArrayWriter(int blobSize) {
+            super(blobSize, char[][]::new, QueueReader.CharArrayReader::new);
+        }
+
+        /**
+         * Add a char array to the queue.
+         *
+         * @return true if the add caused a flush to happen prior to the write, false if no flush happened.
+         */
+        public boolean addCharArray(char[] value) {
+            final boolean flushHappened = current == end;
+            if (flushHappened) {
+                block = flushAndAllocate(1);
+            }
+            block[current++] = value;
+            return flushHappened;
+        }
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java
new file mode 100644
index 00000000000..5f9c99eaee5
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanParser.java
@@ -0,0 +1,57 @@
+package io.deephaven.csv.parsers;
+
+import io.deephaven.csv.parsers.context.SentinelConfiguration;
+import io.deephaven.csv.sinks.Sink;
+import io.deephaven.csv.tokenization.Tokenizer;
+import io.deephaven.csv.parsers.context.ParseContext;
+import io.deephaven.csv.util.CsvReaderException;
+import org.apache.commons.lang3.mutable.MutableBoolean;
+
+/**
+ * The parser for the boolean type.
+ */
+public final class BooleanParser implements Parser {
+    public static final BooleanParser INSTANCE = new BooleanParser();
+
+    private BooleanParser() {}
+
+    @Override
+    public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink,
+            long current, final long end) throws CsvReaderException {
+        final byte[] chunk = new byte[DEST_BLOCK_SIZE];
+        final MutableBoolean booleanHolder = new MutableBoolean();
+        final Tokenizer t = ctx.tokenizer;
+        final Byte nullValue = ctx.sentinelConfiguration.nullBooleanAsByteValue;
+
+        final boolean appending = sink == null;
+        if (appending) {
+            sink = ctx.sinkFactory.makeBooleanAsByteSink();
+        }
+
+        int chunkIndex = 0;
+        do {
+            if (chunkIndex == chunk.length) {
+                sink.write(chunk, 0, current, chunkIndex, appending);
+                current += chunkIndex;
+                chunkIndex = 0;
+            }
+            if (current + chunkIndex == end) {
+                break;
+            }
+            if (ctx.isNullCell(ih)) {
+                chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("boolean as byte", nullValue);
+                continue;
+            }
+            if (!ih.hasBytes()) {
+                return null;
+            }
+            if (!t.tryParseBoolean(ih.bs(), booleanHolder)) {
+                return null;
+            }
+            ctx.isNullOrWidthOneSoFar = false;
+            chunk[chunkIndex++] = booleanHolder.booleanValue() ?
(byte) 1 : (byte) 0; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java new file mode 100644 index 00000000000..d811b975aa0 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java @@ -0,0 +1,68 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableLong; + +/** + * The parser for the byte type. + */ +public final class ByteParser implements Parser { + public static final ByteParser INSTANCE = new ByteParser(); + + private ByteParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + final byte[] chunk = new byte[DEST_BLOCK_SIZE]; + final MutableLong longHolder = new MutableLong(); + final Tokenizer t = ctx.tokenizer; + final Byte nullValue = ctx.sentinelConfiguration.nullByteValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeByteSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("byte", nullValue); + continue; + } + if (!ih.hasBytes()) { + return null; + } + if (!t.tryParseLong(ih.bs(), longHolder)) { + return null; + } + final long value = longHolder.longValue(); + if (!RangeTests.isInRangeForByte(value)) { + return null; + } + if (nullValue != null && value == nullValue) { + // If a sentinel null value is defined, it cannot be present in the input. + return null; + } + if (ih.bs().size() != 1) { + ctx.isNullOrWidthOneSoFar = false; + } + chunk[chunkIndex++] = (byte) value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java new file mode 100644 index 00000000000..efcb526d661 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java @@ -0,0 +1,57 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; + +/** + * The parser for the char type. 
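+ *
+ * <p>
+ * For example (hypothetical data): a column whose cells are all single characters such as "A" or "B" (or null) can
+ * be parsed as char; any wider cell makes this parser return null so that inference can fall back to a wider type.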
+ */ +public final class CharParser implements Parser { + public static final CharParser INSTANCE = new CharParser(); + + private CharParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + if (!ctx.isNullOrWidthOneSoFar) { + return null; + } + final char[] chunk = new char[DEST_BLOCK_SIZE]; + final Character nullValue = ctx.sentinelConfiguration.nullCharValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeCharSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("char", nullValue); + continue; + } + if (ih.sliceSize() > 1) { + return null; + } + final char value = ih.hasBytes() ? (char) ih.bs().front() : ih.cs().front(); + if (nullValue != null && value == nullValue) { + // If a sentinel null value is defined, it cannot be present in the input. + return null; + } + chunk[chunkIndex++] = value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java new file mode 100644 index 00000000000..32dcf98ca9c --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java @@ -0,0 +1,62 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableLong; + +/** + * The parser for the Deephaven DateTime (represented as long) type. + */ +public final class DateTimeAsLongParser implements Parser { + public static final DateTimeAsLongParser INSTANCE = new DateTimeAsLongParser(); + + private DateTimeAsLongParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + final long[] chunk = new long[DEST_BLOCK_SIZE]; + final MutableLong dateTimeAsLongHolder = new MutableLong(); + final Tokenizer t = ctx.tokenizer; + final Long nullValue = ctx.sentinelConfiguration.nullDateTimeAsLongValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeDateTimeAsLongSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("DateTime as long", nullValue); + continue; + } + if (!ih.hasBytes()) { + return null; + } + if (!t.tryParseDateTime(ih.bs(), dateTimeAsLongHolder)) { + return null; + } + final long value = dateTimeAsLongHolder.longValue(); + if (nullValue != null && value == nullValue) { + // If a sentinel null value is defined, it cannot be present in the input. 
+ return null; + } + ctx.isNullOrWidthOneSoFar = false; // No valid DBDateTime is 1 character wide + chunk[chunkIndex++] = value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java new file mode 100644 index 00000000000..4790b5d8011 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java @@ -0,0 +1,65 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableDouble; + +/** + * The parser for the double type. + */ +public final class DoubleParser implements Parser { + public static final DoubleParser INSTANCE = new DoubleParser(); + + private DoubleParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + final double[] chunk = new double[DEST_BLOCK_SIZE]; + final MutableDouble doubleHolder = new MutableDouble(); + final Tokenizer t = ctx.tokenizer; + final Double nullValue = ctx.sentinelConfiguration.nullDoubleValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeDoubleSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("double", nullValue); + continue; + } + if (!ih.hasBytes()) { + return null; + } + if (!t.tryParseDouble(ih.bs(), doubleHolder)) { + return null; + } + final double value = doubleHolder.doubleValue(); + if (nullValue != null && value == nullValue) { + // If a sentinel null value is defined, it cannot be present in the input. + return null; + } + if (ih.bs().size() > 1) { + // Not an error, but needed in case we eventually fall back to char. + ctx.isNullOrWidthOneSoFar = false; + } + chunk[chunkIndex++] = value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java new file mode 100644 index 00000000000..9bf0185b6fe --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatParser.java @@ -0,0 +1,78 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableDouble; + +/** + * The parser for the float type. 
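+ *
+ * <p>
+ * For example (hypothetical data): "1.5" is in float range and has few significant figures, so it is accepted; a
+ * cell like "1.23456789" has 8 or more significant figures, so when a double parser is also available this parser
+ * returns null and inference falls back to double.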
+ */ +public final class FloatParser implements Parser { + public static final FloatParser INSTANCE = new FloatParser(); + + private FloatParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + if (!ctx.hasFewerThan8SigFigsSoFar) { + return null; + } + final float[] chunk = new float[DEST_BLOCK_SIZE]; + final MutableDouble doubleHolder = new MutableDouble(); + final Tokenizer t = ctx.tokenizer; + final Float nullValue = ctx.sentinelConfiguration.nullFloatValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeFloatSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("float", nullValue); + continue; + } + if (!ih.hasBytes()) { + return null; + } + if (!t.tryParseDouble(ih.bs(), doubleHolder)) { + return null; + } + final double value = doubleHolder.doubleValue(); + if (!RangeTests.isInRangeForFloat(value)) { + return null; + } + if (nullValue != null && value == nullValue) { + // If a sentinel null value is defined, it cannot be present in the input. + return null; + } + if (ctx.doubleParserIsAvailable && + !RangeTests.hasFewerThan8SignificantFigures(ih.bs().data(), ih.bs().begin(), ih.bs().end())) { + // Lots of significant figures, so fall back to double parsing (but only + // if a double parse is available). + return null; + } + if (ih.bs().size() > 1) { + // Not an error, but needed in case we eventually fall back to char. + ctx.isNullOrWidthOneSoFar = false; + } + chunk[chunkIndex++] = (float) value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java new file mode 100644 index 00000000000..65c624b1133 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java @@ -0,0 +1,76 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableLong; + +/** + * The parser for the int type. 
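+ *
+ * <p>
+ * For example (hypothetical data): "123" and "-456" are accepted; "3000000000" parses as a long but is out of int
+ * range, so this parser returns null and inference falls back to a wider type such as long.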
+ */
+public final class IntParser implements Parser<int[]> {
+    public static final IntParser INSTANCE = new IntParser();
+
+    private IntParser() {}
+
+    @Override
+    public Sink<int[]> tryParse(final ParseContext ctx, final IteratorHolder ih, Sink<int[]> sink,
+            long current, final long end) throws CsvReaderException {
+        final int[] chunk = new int[DEST_BLOCK_SIZE];
+        final MutableLong longHolder = new MutableLong();
+        final Tokenizer t = ctx.tokenizer;
+        final Integer nullValue = ctx.sentinelConfiguration.nullIntValue;
+
+        final boolean appending = sink == null;
+        if (appending) {
+            sink = ctx.sinkFactory.makeIntSink();
+        }
+
+        int chunkIndex = 0;
+        do {
+            if (chunkIndex == chunk.length) {
+                sink.write(chunk, 0, current, chunkIndex, appending);
+                current += chunkIndex;
+                chunkIndex = 0;
+            }
+            if (current + chunkIndex == end) {
+                break;
+            }
+            if (ctx.isNullCell(ih)) {
+                chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("int", nullValue);
+                continue;
+            }
+            if (!ih.hasBytes()) {
+                return null;
+            }
+            if (!t.tryParseLong(ih.bs(), longHolder)) {
+                return null;
+            }
+            final long value = longHolder.longValue();
+            if (!RangeTests.isInRangeForInt(value)) {
+                return null;
+            }
+            if (nullValue != null && value == nullValue) {
+                // If a sentinel null value is defined, it cannot be present in the input.
+                return null;
+            }
+            if (value < -9_999_999 || value > 9_999_999) {
+                // Not an error, but needed in case we eventually fall back to float.
+                ctx.hasFewerThan8SigFigsSoFar = false;
+            }
+            if (ih.bs().size() > 1) {
+                // Not an error, but needed in case we eventually fall back to char.
+                ctx.isNullOrWidthOneSoFar = false;
+            }
+            chunk[chunkIndex++] = (int) value;
+        } while (ih.tryMoveNext());
+        sink.write(chunk, 0, current, chunkIndex, appending);
+        return sink;
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java
new file mode 100644
index 00000000000..c4f26bab230
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java
@@ -0,0 +1,129 @@
+package io.deephaven.csv.parsers;
+
+import io.deephaven.csv.densestorage.DenseStorageReader;
+import io.deephaven.csv.densestorage.DenseStorageReader.SliceType;
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.containers.CharSlice;
+import io.deephaven.csv.util.CsvReaderException;
+
+/**
+ * This class holds the underlying {@link DenseStorageReader} plus some associated helpers: an allocated
+ * {@link ByteSlice} and {@link CharSlice} for slice storage, plus a couple of statistics like {@link #numConsumed}
+ * and {@link #isExhausted}.
+ */
+public final class IteratorHolder {
+    private final DenseStorageReader dsr;
+    /**
+     * Indicates which slice (if any) has valid contents.
+     */
+    private SliceType activeSlice = SliceType.None;
+    /**
+     * Storage for our reusable byte slice. Data inside it is valid after a call to tryMoveNext() returns true, in
+     * the case where hasBytes() is true.
+     */
+    private final ByteSlice bs = new ByteSlice();
+    /**
+     * Storage for our reusable char slice. Data inside it is valid after a call to tryMoveNext() returns true, in
+     * the case where hasBytes() is false.
+     */
+    private final CharSlice cs = new CharSlice();
+    /**
+     * Number of successful calls so far to tryMoveNext (i.e.
those that returned true). + */ + private long numConsumed = 0; + /** + * Valid anytime after the first call to tryMoveNext(), but not before. + */ + private boolean isExhausted = false; + + /** + * Constructor. + */ + public IteratorHolder(DenseStorageReader dsr) { + this.dsr = dsr; + } + + /** + * Try to advance to the next item. + * + * @return true if there was a next item. Otherwise false. + */ + public boolean tryMoveNext() throws CsvReaderException { + activeSlice = dsr.tryGetNextSlice(bs, cs); + if (activeSlice == SliceType.None) { + isExhausted = true; + return false; + } + ++numConsumed; + return true; + } + + /** + * Convert the current slice (whether byte or char) to a String. + * + * @return The slice as String. + */ + public String sliceToString() { + return hasBytes() ? bs.toString() : cs.toString(); + } + + /** + * The size of the current byte or char slice. + * + * @return The slice size. + */ + public int sliceSize() { + return hasBytes() ? bs.size() : cs.size(); + } + + /** + * Is the active slice the byte slice? + * + * @return true if the active slice is the byte slice. Otherwise, false if the active slice is the char slice. + */ + public boolean hasBytes() { + switch (activeSlice) { + case Bytes: + return true; + case Chars: + return false; + case None: + throw new RuntimeException("Logic error: There is no active slice."); + default: + throw new RuntimeException("Logic error: unhandled case " + activeSlice); + } + } + + /** + * Getter for the byte slice. + */ + public ByteSlice bs() { + return bs; + } + + /** + * Getter for the char slice. + */ + public CharSlice cs() { + return cs; + } + + /** + * Number of items we've consumed so far. This is the number of times {@link #tryMoveNext} has been called and + * returned true. + * + * @return The number of items we've consumed so far + */ + public long numConsumed() { + return numConsumed; + } + + /** + * Is the iteration exhausted? This is set to true when {@link #tryMoveNext} is called and returns false. + * + * @return Whether the iteration is exhausted. + */ + public boolean isExhausted() { + return isExhausted; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java new file mode 100644 index 00000000000..7102dad30af --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java @@ -0,0 +1,72 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableLong; + +/** + * The parser for the long type. 
+ */
+public final class LongParser implements Parser<long[]> {
+    public static final LongParser INSTANCE = new LongParser();
+
+    private LongParser() {}
+
+    @Override
+    public Sink<long[]> tryParse(final ParseContext ctx, final IteratorHolder ih, Sink<long[]> sink,
+            long current, final long end) throws CsvReaderException {
+        final long[] chunk = new long[DEST_BLOCK_SIZE];
+        final MutableLong longHolder = new MutableLong();
+        final Tokenizer t = ctx.tokenizer;
+        final Long nullValue = ctx.sentinelConfiguration.nullLongValue;
+
+        final boolean appending = sink == null;
+        if (appending) {
+            sink = ctx.sinkFactory.makeLongSink();
+        }
+
+        int chunkIndex = 0;
+        do {
+            if (chunkIndex == chunk.length) {
+                sink.write(chunk, 0, current, chunkIndex, appending);
+                current += chunkIndex;
+                chunkIndex = 0;
+            }
+            if (current + chunkIndex == end) {
+                break;
+            }
+            if (ctx.isNullCell(ih)) {
+                chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("long", nullValue);
+                continue;
+            }
+            if (!ih.hasBytes()) {
+                return null;
+            }
+            if (!t.tryParseLong(ih.bs(), longHolder)) {
+                return null;
+            }
+            final long value = longHolder.longValue();
+            if (nullValue != null && value == nullValue) {
+                // If a sentinel null value is defined, it cannot be present in the input.
+                return null;
+            }
+            if (value < -9_999_999 || value > 9_999_999) {
+                // Not an error, but needed in case we eventually fall back to float.
+                ctx.hasFewerThan8SigFigsSoFar = false;
+            }
+            if (ih.bs().size() > 1) {
+                // Not an error, but needed in case we eventually fall back to char.
+                ctx.isNullOrWidthOneSoFar = false;
+            }
+            chunk[chunkIndex++] = value;
+        } while (ih.tryMoveNext());
+        sink.write(chunk, 0, current, chunkIndex, appending);
+        return sink;
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java
new file mode 100644
index 00000000000..2573bffbb9b
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java
@@ -0,0 +1,31 @@
+package io.deephaven.csv.parsers;
+
+import io.deephaven.csv.parsers.context.ParseContext;
+import io.deephaven.csv.sinks.Sink;
+import io.deephaven.csv.util.CsvReaderException;
+
+public interface Parser<TARRAY> {
+    /**
+     * Size of the chunk allocated by the various parsers.
+     */
+    int DEST_BLOCK_SIZE = 65536 * 4;
+
+    /**
+     * Tries to parse the data pointed to by IteratorHolder 'ih' into a Sink.
+     *
+     * @param ctx The ParseContext holding various shared parameters for the parse.
+     * @param ih An IteratorHolder pointing to the data. It is already pointing to the current element or the end (in
+     *        other words, it has had "tryMoveNext" called on it at least once). The reason for this invariant is that
+     *        other code (controlling logic and other parsers) has needed to peek at the current element to decide
+     *        what to do.
+     * @param sink null if the caller is invoking this method in "phase 1" of the parse. Non-null (in fact, this
+     *        method's previous return value) if the caller is invoking this method in "phase 2" of the parse.
+     * @param current The next destination slot to write to.
+     * @param end The ending (exclusive) slot to write to. This value might be a large value like Long.MAX_VALUE if
+     *        the caller does not know how many values there are.
The logic of this method should always stop when + * {@code ih} is exhausted or when {@code current} reaches {@code end}, whichever comes first. + * @return A Sink<TARRAY> if the parse was successful; null otherwise. + */ + Sink tryParse(ParseContext ctx, IteratorHolder ih, Sink sink, + long current, long end) throws CsvReaderException; +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java new file mode 100644 index 00000000000..e2930493091 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java @@ -0,0 +1,100 @@ +package io.deephaven.csv.parsers; + +import java.util.ArrayList; +import java.util.List; + +/** + * Standard system parsers for the {@link io.deephaven.csv.reading.CsvReader}. + */ +public class Parsers { + public static final BooleanParser BOOLEAN = BooleanParser.INSTANCE; + public static final ByteParser BYTE = ByteParser.INSTANCE; + public static final ShortParser SHORT = ShortParser.INSTANCE; + public static final IntParser INT = IntParser.INSTANCE; + public static final LongParser LONG = LongParser.INSTANCE; + public static final FloatParser FLOAT = FloatParser.INSTANCE; + public static final DoubleParser DOUBLE = DoubleParser.INSTANCE; + public static final DateTimeAsLongParser DATETIME = DateTimeAsLongParser.INSTANCE; + public static final CharParser CHAR = CharParser.INSTANCE; + public static final StringParser STRING = StringParser.INSTANCE; + public static final TimestampSecondsParser TIMESTAMP_SECONDS = TimestampSecondsParser.INSTANCE; + public static final TimestampMillisParser TIMESTAMP_MILLIS = TimestampMillisParser.INSTANCE; + public static final TimestampMicrosParser TIMESTAMP_MICROS = TimestampMicrosParser.INSTANCE; + public static final TimestampNanosParser TIMESTAMP_NANOS = TimestampNanosParser.INSTANCE; + + public static final List> PRECEDENCE = List.of( + BOOLEAN, + BYTE, + SHORT, + INT, + LONG, + TIMESTAMP_SECONDS, + TIMESTAMP_MILLIS, + TIMESTAMP_MICROS, + TIMESTAMP_NANOS, + FLOAT, + DOUBLE, + DATETIME, + CHAR, + STRING); + + /** + * Notably, BYTE, SHORT, and FLOAT are not in the list of standard parsers. The TIMESTAMP_* parsers are never + * included by default, because they look like ints/longs. + */ + public static final List> DEFAULT = List.of( + BOOLEAN, + INT, + LONG, + DOUBLE, + DATETIME, + CHAR, + STRING); + + /** + * The above, including BYTE, SHORT, and FLOAT. The TIMESTAMP_* parsers are never included by default, because they + * look like ints/longs. + */ + public static final List> COMPLETE = List.of( + BOOLEAN, + BYTE, + SHORT, + INT, + LONG, + FLOAT, + DOUBLE, + DATETIME, + CHAR, + STRING); + + /** + * Minimal + */ + public static final List> MINIMAL = List.of( + DATETIME, + LONG, + DOUBLE, + BOOLEAN, + STRING); + + /** + * Strings only. 
+ */ + public static final List> STRINGS = List.of(STRING); + + public static final List> STANDARD_TIMES_PARSERS = timesBaseParsersAnd(Parsers.TIMESTAMP_SECONDS); + public static final List> STANDARD_MILLITIMES_PARSERS = timesBaseParsersAnd(Parsers.TIMESTAMP_MILLIS); + public static final List> STANDARD_MICROTIMES_PARSERS = timesBaseParsersAnd(Parsers.TIMESTAMP_MICROS); + public static final List> STANDARD_NANOTIMES_PARSERS = timesBaseParsersAnd(Parsers.TIMESTAMP_NANOS); + + private static List> timesBaseParsersAnd(final Parser oneMore) { + final List> result = new ArrayList<>(); + result.add(DATETIME); + result.add(DOUBLE); + result.add(BOOLEAN); + result.add(CHAR); + result.add(STRING); + result.add(oneMore); + return List.copyOf(result); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java new file mode 100644 index 00000000000..4e5434382c4 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java @@ -0,0 +1,68 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableLong; + +/** + * The parser for the short type. + */ +public final class ShortParser implements Parser { + public static final ShortParser INSTANCE = new ShortParser(); + + private ShortParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + final short[] chunk = new short[DEST_BLOCK_SIZE]; + final MutableLong longHolder = new MutableLong(); + final Tokenizer t = ctx.tokenizer; + final Short nullValue = ctx.sentinelConfiguration.nullShortValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeShortSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("short", nullValue); + continue; + } + if (!ih.hasBytes()) { + return null; + } + if (!t.tryParseLong(ih.bs(), longHolder)) { + return null; + } + final long value = longHolder.longValue(); + if (!RangeTests.isInRangeForShort(value)) { + return null; + } + if (nullValue != null && value == nullValue) { + // If a sentinel null value is defined, it cannot be present in the input. 
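+                // (Otherwise the sentinel would be indistinguishable from a true null cell, so the parser
+                // punts and lets the next parser in the inference sequence handle the column.)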
+ return null; + } + if (ih.bs().size() > 1) { + ctx.isNullOrWidthOneSoFar = false; + } + chunk[chunkIndex++] = (short) value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java new file mode 100644 index 00000000000..a91aef954b0 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java @@ -0,0 +1,60 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableObject; + +/** + * The parser for the String type. + */ +public final class StringParser implements Parser { + public static final StringParser INSTANCE = new StringParser(); + + private StringParser() {} + + @Override + public Sink tryParse(final ParseContext ctx, final IteratorHolder ih, Sink sink, + long current, final long end) throws CsvReaderException { + final String[] chunk = new String[DEST_BLOCK_SIZE]; + final MutableObject nullValue = ctx.sentinelConfiguration.nullStringValue; + + final boolean appending = sink == null; + if (appending) { + sink = ctx.sinkFactory.makeStringSink(); + } + + int chunkIndex = 0; + do { + if (chunkIndex == chunk.length) { + sink.write(chunk, 0, current, chunkIndex, appending); + current += chunkIndex; + chunkIndex = 0; + } + if (current + chunkIndex == end) { + break; + } + if (ctx.isNullCell(ih)) { + // Special case for String: if the null cell literal is the empty string (as it is by default), + // and the user didn't configure a null value sentinel, then when we encounter an empty string, + // just pass it through rather than failing the parse. (This is arguably more user-friendly than + // failing a parse for empty string). + if (nullValue == null && ih.sliceSize() == 0) { + chunk[chunkIndex++] = ""; + continue; + } + chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("String", nullValue).getValue(); + continue; + } + final String value = ih.sliceToString(); + if (nullValue != null && value.equals(nullValue.getValue())) { + // If a sentinel null value is defined, it cannot be present in the input. + return null; + } + chunk[chunkIndex++] = value; + } while (ih.tryMoveNext()); + sink.write(chunk, 0, current, chunkIndex, appending); + return sink; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java new file mode 100644 index 00000000000..212b1a925b3 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java @@ -0,0 +1,12 @@ +package io.deephaven.csv.parsers; + +/** + * The parser for "microseconds since Unix epoch". 
+ */ +public class TimestampMicrosParser extends TimestampParserBase { + public static final TimestampMicrosParser INSTANCE = new TimestampMicrosParser(); + + private TimestampMicrosParser() { + super(MICROSECOND_SCALE); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java new file mode 100644 index 00000000000..34357238aef --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java @@ -0,0 +1,12 @@ +package io.deephaven.csv.parsers; + +/** + * The parser for "milliseconds since Unix epoch". + */ +public class TimestampMillisParser extends TimestampParserBase { + public static final TimestampMillisParser INSTANCE = new TimestampMillisParser(); + + private TimestampMillisParser() { + super(MILLISECOND_SCALE); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java new file mode 100644 index 00000000000..616d04c3e39 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java @@ -0,0 +1,12 @@ +package io.deephaven.csv.parsers; + +/** + * The parser for "nanoseconds since Unix epoch". + */ +public class TimestampNanosParser extends TimestampParserBase { + public static final TimestampNanosParser INSTANCE = new TimestampNanosParser(); + + private TimestampNanosParser() { + super(NANOSECOND_SCALE); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java new file mode 100644 index 00000000000..74a25ee38c3 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java @@ -0,0 +1,90 @@ +package io.deephaven.csv.parsers; + +import io.deephaven.csv.parsers.context.SentinelConfiguration; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableLong; + +/** + * The base class for various timestamp parsers. These parsers parse longs, scale them by some appropriate value, and + * then feed them to the sink for the Deephaven DateTime (as long) type. 
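+ * For example, at seconds scale an input cell of {@code 1640995200} is multiplied by {@code SECOND_SCALE}
+ * (1_000_000_000) and written to the sink as {@code 1640995200000000000} nanoseconds.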
+ */
+public abstract class TimestampParserBase implements Parser<long[]> {
+    protected static final long SECOND_SCALE = 1_000_000_000;
+    protected static final long MILLISECOND_SCALE = 1_000_000;
+    protected static final long MICROSECOND_SCALE = 1_000;
+    protected static final long NANOSECOND_SCALE = 1;
+
+    private final long scale;
+    private final long minValue;
+    private final long maxValue;
+
+    /**
+     * @param scale 1_000_000_000 for seconds, 1_000_000 for millis, 1_000 for micros, 1 for nanos (i.e. the factor
+     *        that converts the input unit to nanoseconds)
+     */
+    protected TimestampParserBase(long scale) {
+        this.scale = scale;
+        minValue = Long.MIN_VALUE / scale;
+        maxValue = Long.MAX_VALUE / scale;
+    }
+
+    @Override
+    public Sink<long[]> tryParse(final ParseContext ctx, final IteratorHolder ih, Sink<long[]> sink,
+            long current, final long end) throws CsvReaderException {
+        final long[] chunk = new long[DEST_BLOCK_SIZE];
+        final MutableLong longHolder = new MutableLong();
+        final Tokenizer t = ctx.tokenizer;
+        final Long nullValue = ctx.sentinelConfiguration.nullTimestampAsLongValue;
+
+        final boolean appending = sink == null;
+        if (appending) {
+            sink = ctx.sinkFactory.makeTimestampAsLongSink();
+        }
+
+        int chunkIndex = 0;
+        do {
+            if (chunkIndex == chunk.length) {
+                sink.write(chunk, 0, current, chunkIndex, appending);
+                current += chunkIndex;
+                chunkIndex = 0;
+            }
+            if (current + chunkIndex == end) {
+                break;
+            }
+            if (ctx.isNullCell(ih)) {
+                chunk[chunkIndex++] = SentinelConfiguration.assertHasNullValue("timestamp", nullValue);
+                continue;
+            }
+            if (!ih.hasBytes()) {
+                return null;
+            }
+            if (!t.tryParseLong(ih.bs(), longHolder)) {
+                return null;
+            }
+            final long value = longHolder.longValue();
+            if (value < minValue || value > maxValue) {
+                return null;
+            }
+            if (nullValue != null && value == nullValue) {
+                // If a sentinel null value is defined, it cannot be present in the input.
+                return null;
+            }
+            // TODO(kosak): this is a cheap test but is not really fair, as there are
+            // certainly *some* large longs (e.g. powers of two) that are easily
+            // representable as float. It may be good enough however, and may more closely
+            // represent the input's "intent".
+            if (value < -9_999_999 || value > 9_999_999) {
+                // Not an error, but needed in case we eventually fall back to float.
+                ctx.hasFewerThan8SigFigsSoFar = false;
+            }
+            if (ih.bs().size() > 1) {
+                ctx.isNullOrWidthOneSoFar = false;
+            }
+            chunk[chunkIndex++] = value * scale;
+        } while (ih.tryMoveNext());
+        sink.write(chunk, 0, current, chunkIndex, appending);
+        return sink;
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java
new file mode 100644
index 00000000000..69949aa5caa
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java
@@ -0,0 +1,12 @@
+package io.deephaven.csv.parsers;
+
+/**
+ * The parser for "seconds since Unix epoch".
+ */ +public final class TimestampSecondsParser extends TimestampParserBase { + public static final TimestampSecondsParser INSTANCE = new TimestampSecondsParser(); + + private TimestampSecondsParser() { + super(SECOND_SCALE); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/context/ParseContext.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/context/ParseContext.java new file mode 100644 index 00000000000..10aabc5ee04 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/context/ParseContext.java @@ -0,0 +1,104 @@ +package io.deephaven.csv.parsers.context; + +import io.deephaven.csv.parsers.IteratorHolder; +import io.deephaven.csv.sinks.SinkFactory; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; + +import java.util.Arrays; + +/** + * Shared state (some of it mutable) for the parsers participating in type inference for a given column. + */ +public final class ParseContext { + /** + * The Tokenizer is responsible for parsing entities like ints, doubles, supported DateTime formats, etc. + */ + public final Tokenizer tokenizer; + /** + * Caller-specified interface for making all the various Sink<TARRAY> types. + */ + public final SinkFactory sinkFactory; + /** + * The settings for the various null sentinels. + */ + public final SentinelConfiguration sentinelConfiguration; + /** + * This is a bit of a hack which allows the FloatParser to decide what to do when it encounters a value with more + * significant digits than would fit in a float. If it knows a double parser is available, the float parser will + * fail (and let the double parser handle the value). Otherwise (if a double parser is not available), it will just + * accept the number and truncate. + */ + public final boolean doubleParserIsAvailable; + /** + * Whether all the cells seen so far have fewer than 8 significant figures. This is used when inferring float vs + * long. + */ + public boolean hasFewerThan8SigFigsSoFar; + /** + * Whether all the cells seen so far are the "null" indicator (usually the empty string), or are 1 character in + * length. This is used when inferring char vs String. + */ + public boolean isNullOrWidthOneSoFar; + /** + * If the null sentinel is not the empty string, and can be represented as bytes, then this field contains the bytes + * of the null sentinel string. Otherwise this field contains null. + */ + private final byte[] nullSentinelBytes; + /** + * If the null sentinel is not the empty string, and can be represented as chars but not bytes, then this field + * contains the chars of the null sentinel string. Otherwise this field contains null. + */ + private final char[] nullSentinelChars; + + public ParseContext(final Tokenizer tokenizer, final SinkFactory sinkFactory, + final SentinelConfiguration sentinelConfiguration, + String nullValueLiteral, final boolean doubleParserIsAvailable) { + this.tokenizer = tokenizer; + this.sinkFactory = sinkFactory; + this.sentinelConfiguration = sentinelConfiguration; + this.doubleParserIsAvailable = doubleParserIsAvailable; + hasFewerThan8SigFigsSoFar = true; + isNullOrWidthOneSoFar = true; + + // Process the nullValueLiteral into a byte array, a char array, or neither, so the isNullCell test + // can run quickly. 
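+        // When the literal is byte-representable we can compare candidate cells directly against the byte
+        // slice; otherwise we fall back to comparing against the char slice.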
+ byte[] nsb = null; + char[] nsc = null; + if (!nullValueLiteral.isEmpty()) { + final char[] data = nullValueLiteral.toCharArray(); + if (RangeTests.isByteRepresentable(data, 0, data.length)) { + nsb = new byte[data.length]; + for (int ii = 0; ii < data.length; ++ii) { + nsb[ii] = (byte) data[ii]; + } + } else { + nsc = data; + } + } + nullSentinelBytes = nsb; + nullSentinelChars = nsc; + } + + /** + * Determines whether the iterator's current text contains the null value literal. The notion of "null value + * literal" is user-configurable on a per-column basis, but is typically the empty string. + * + * @return whether the iterator's current text contains the null cell. + */ + public boolean isNullCell(IteratorHolder ih) { + if (nullSentinelBytes != null) { + return ih.hasBytes() && + Arrays.equals(ih.bs().data(), ih.bs().begin(), ih.bs().end(), + nullSentinelBytes, 0, nullSentinelBytes.length); + } + if (nullSentinelChars != null) { + return !ih.hasBytes() && + Arrays.equals(ih.cs().data(), ih.cs().begin(), ih.cs().end(), + nullSentinelChars, 0, nullSentinelChars.length); + } + // If both arrays are null, then the sentinel text is the empty string. + return ih.sliceSize() == 0; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/context/SentinelConfiguration.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/context/SentinelConfiguration.java new file mode 100644 index 00000000000..d2b693dcf34 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/parsers/context/SentinelConfiguration.java @@ -0,0 +1,158 @@ +package io.deephaven.csv.parsers.context; + +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableObject; + +/** + *

+ * This class holds the configuration for the optional sentinel values for each type. This is useful for systems
+ * where there is a special distinguished value, taken from the type's range of values, that is meant to be
+ * interpreted as null. For example, in the Deephaven system, Long.MIN_VALUE is not a valid long value; rather it is
+ * taken to mean "null".
+ *
+ * <p>
+ * The library also defines a notion of "null literal", which can be configured on a global or per-column basis. When
+ * the reader encounters that special string, it will interpret it as the null sentinel for the given type. The
+ * default null literal is the empty string.
+ *
+ * <p>
+ * When the reader encounters the null literal defined for a column, the library will consult the values here to see
+ * if there is a sentinel value for that type. If there is no such sentinel value configured, then it is an error to
+ * have a null literal in the input. Conversely, if there is a sentinel value configured here, then the system will
+ * refuse to read a literal of that type that parses to that sentinel value.
+ *
+ * <p>
+ * The rule is loosened for {@link String}: if the parser encounters an empty String AND the empty String is the null
+ * value literal BUT there is no null sentinel value configured for the String type, then the String parser will just
+ * pass through the empty string rather than complaining that there is no null value configured.
+ *
+ * <p>
+ * This rule interacts with type inference: say you have configured nullIntValue to be some value (say,
+ * {@link Integer#MIN_VALUE}). If the system is reading a column of ints, and you have "-2147483648" in the input,
+ * then the system will conclude that it cannot parse the column as int and it will try the next configured parser
+ * (typically, the parser for long). Each parser is configured this way (to punt to the next). If type inference has
+ * led you to the "last" numeric parser, namely double, and you encounter the nullDoubleValue, then the system would
+ * interpret your whole column as strings.
+ *
+ * <p>
+ * Consider the following input:
+ *
+ * <ul>
+ * <li>3</li>
+ * <li>[blank line]</li>
+ * <li>5</li>
+ * </ul>
+ *
+ * <p>
+ * Example 1:
+ * <ol>
+ * <li>null value literal configured to be ""</li>
+ * <li>null value sentinel for int configured to be -2147483648</li>
+ * </ol>
+ *
+ * <p>
+ * The above input would be parsed as the integers 3, -2147483648, 5.
+ *
+ * <p>
+ * Example 2:
+ * <ol>
+ * <li>null value literal configured to be "NULL"</li>
+ * <li>null value sentinel for int configured to be -2147483648</li>
+ * </ol>
+ *
+ * <p>
+ * Because integer parsing will fail on the blank line, the above input would be parsed as the strings "3", "", "5".
+ *
+ * <p>
+ * Now consider the following input:
+ *
+ * <ul>
+ * <li>3</li>
+ * <li>-2147483648</li>
+ * <li>5</li>
+ * </ul>
+ *
+ * <p>
+ * Example 3:
+ * <ol>
+ * <li>null value literal configured to be ""</li>
+ * <li>null value sentinel for int configured to be -2147483648</li>
+ * </ol>
+ *
+ * <p>
+ * Integer parsing will fail here because the null value sentinel -2147483648 appears in the input, but the null
+ * value literal is something other than "-2147483648". When this happens, the integer parser will punt to the next
+ * parser in the sequence, which is typically long. So the above input would be parsed as the longs 3, -2147483648, 5
+ * (assuming that the null sentinel value for long is unset, or is something other than 3, -2147483648, or 5).
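+ *
+ * <p>
+ * As an illustrative sketch only (the builder methods referenced here are defined on CsvReader, below), a caller
+ * might pair a null literal with a byte sentinel like so:
+ *
+ * <pre>
+ * final CsvReader csvr = new CsvReader()
+ *         .setNullValueLiteral("NULL")       // the string that means "null" in the input
+ *         .setNullByteValue(Byte.MIN_VALUE); // the sentinel stored when a null byte cell is read
+ * </pre>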

+ */
+public class SentinelConfiguration {
+    /**
+     * null means "not configured".
+     */
+    public Byte nullBooleanAsByteValue;
+    /**
+     * null means "not configured".
+     */
+    public Byte nullByteValue;
+    /**
+     * null means "not configured".
+     */
+    public Short nullShortValue;
+    /**
+     * null means "not configured".
+     */
+    public Integer nullIntValue;
+    /**
+     * null means "not configured".
+     */
+    public Long nullLongValue;
+    /**
+     * null means "not configured".
+     */
+    public Float nullFloatValue;
+    /**
+     * null means "not configured".
+     */
+    public Double nullDoubleValue;
+    /**
+     * null means "not configured".
+     */
+    public Character nullCharValue;
+    /**
+     * We need to represent the concepts of "null value not configured" as well as "null value configured and is
+     * null". I'd like to use Optional<String> here but Optional can't distinguish a null value from an unset
+     * optional. So we use MutableObject<String> instead. A null MutableObject means "not configured". A non-null
+     * MutableObject (whose configured value may indeed be a null String) means "configured".
+     */
+    public MutableObject<String> nullStringValue;
+    /**
+     * null means "not configured".
+     */
+    public Long nullDateTimeAsLongValue;
+    /**
+     * null means "not configured".
+     */
+    public Long nullTimestampAsLongValue;
+
+    /**
+     * Utility method for parsers which throws a descriptive exception when a given null sentinel value is not
+     * defined.
+     */
+    public static <T> T assertHasNullValue(String what, T boxedValue) throws CsvReaderException {
+        if (boxedValue != null) {
+            return boxedValue;
+        }
+        throw new CsvReaderException("Encountered a null cell but no null value for " + what + " was configured.");
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java
new file mode 100644
index 00000000000..267182bd907
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java
@@ -0,0 +1,324 @@
+package io.deephaven.csv.reading;
+
+import io.deephaven.csv.containers.GrowableCharBuffer;
+import io.deephaven.csv.tokenization.RangeTests;
+import io.deephaven.csv.containers.CharSlice;
+import io.deephaven.csv.util.CsvReaderException;
+import org.apache.commons.lang3.mutable.MutableBoolean;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * This class is used to traverse over text from a Reader, understanding both field and line delimiters, as well as
+ * the CSV quoting convention, and breaking the text into cells for use by the calling code.
+ */
+final class CellGrabber {
+    /**
+     * Size of chunks to read from the {@link Reader}.
+     */
+    private static final int BUFFER_SIZE = 65536;
+    /**
+     * The {@link Reader} providing the input.
+     */
+    private final Reader reader;
+    /**
+     * The configured CSV quote character (typically '"').
+     */
+    private final char quoteChar;
+    /**
+     * The configured CSV field delimiter (typically ',').
+     */
+    private final char fieldDelimiter;
+    /**
+     * Whether to trim leading and trailing blanks from non-quoted values.
+     */
+    private final boolean ignoreSurroundingSpaces;
+    /**
+     * Whether to trim leading and trailing blanks from inside quoted values.
+     */
+    private final boolean trim;
+    /**
+     * The current chunk we have read from the file.
+     */
+    private final char[] buffer;
+    /**
+     * Size of the last buffer chunk read.
+     */
+    private int size;
+    /**
+     * Current offset in the buffer chunk.
+ */ + private int offset; + /** + * Starting offset of a contiguous span of characters we are scanning from the buffer chunk. + */ + private int startOffset; + /** + * A side buffer we have to use for edge cases. Normally we try to return a CharSlice which shares our buffer[] + * array. But we can't do that when the input cell spans more than one buffer[] chunk, or when the input cell does + * not exactly represent the output. For example an escaped quote ("") needs to be returned as a single quotation + * mark. So if our input is hello""there, then we can't return directly return a slice of the input array, because + * we need hello"there. + */ + private final GrowableCharBuffer spillBuffer; + + /** + * Constructor. + */ + public CellGrabber(final Reader reader, final char quoteChar, final char fieldDelimiter, + final boolean ignoreSurroundingSpaces, final boolean trim) { + this.reader = reader; + this.quoteChar = quoteChar; + this.fieldDelimiter = fieldDelimiter; + this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; + this.trim = trim; + this.buffer = new char[BUFFER_SIZE]; + this.size = 0; + this.offset = 0; + this.startOffset = 0; + this.spillBuffer = new GrowableCharBuffer(); + } + + /** + * Try to grab the next cell from the input, being aware of field delimiters, line delimiters, quoting, and + * trimming. + * + * @param dest The result, as a {@link CharSlice}. The CharSlice is invalidated by the next call to grabNext. + * @param lastInRow An out parameter whose contents are only specified if this method returns true. Its contents + * will be set to true if the cell just read was the last cell in the row, otherwise they will be set to + * false. + * @return true if a cell was read; false if at end of input. + */ + public boolean grabNext(final CharSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { + spillBuffer.clear(); + startOffset = offset; + + if (ignoreSurroundingSpaces) { + skipWhitespace(); + } + if (!tryEnsureMore()) { + return false; + } + + // Is first char the quote char? + if (buffer[offset] == quoteChar) { + ++offset; + processQuotedMode(dest, lastInRow); + if (trim) { + trimWhitespace(dest); + } + } else { + processUnquotedMode(dest, lastInRow); + } + return true; + } + + /** + * Process characters in "quoted mode". This involves some trickery to deal with quoted quotes and the end quote. + * + * @param lastInRow An out parameter. Its contents will be set to true if the cell just read was the last cell in + * the row, otherwise they will be set to false. + */ + private void processQuotedMode(final CharSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { + startOffset = offset; + while (true) { + if (offset == size) { + if (!tryEnsureMore()) { + throw new CsvReaderException("Cell did not have closing quote character"); + } + } + final char ch = buffer[offset++]; + if (ch != quoteChar) { + // Ordinary character. Note: in quoted mode we will gladly eat field and line separators. + continue; + } + // This character is a quote char. It could be the end of the cell, or it could be an escaped + // quote char (e.g. ""). The way to tell is to peek ahead at the next character. + if (!tryEnsureMore()) { + // There is no next char (we are at end of input), so let's call this end of cell. + break; + } + final char peek = buffer[offset]; + if (peek != quoteChar) { + // There is a next char, but it's not a quotation mark. So this + // quotation mark must be the end of the quoted string. 
+ break; + } + // There is a next character, and it *is* a quotation mark. So this is a quoted quote + // "", to be interpreted as ". So we'll spill this string (up to the first quotation mark), + // skip the second quotation mark, and keep going. + spillRange(); + // Skip the second quotation mark. + ++offset; + startOffset = offset; + } + // We got out of the quoted string. Consume any trailing matter after the quote and before the field + // delimiter. Hopefully that trailing matter is just whitespace, but we shall see. + finishField(dest, lastInRow); + + // From this point on, note that dest is a slice that may point to the underlying input buffer or the spill + // buffer. Take care from this point on to not disturb the input (e.g. by reading the next chunk) or the + // spill buffer. + + // The easiest way to make all the above logic run smoothly is to let the final quotation mark + // (which will unconditionally be there) and subsequent whitespace (if any) into the field. + // Then we can simply trim it back out now. + while (dest.begin() != dest.end() && RangeTests.isWhitespace(dest.back())) { + dest.setEnd(dest.end() - 1); + } + if (dest.begin() == dest.end() || dest.back() != quoteChar) { + throw new RuntimeException("Logic error: final non-whitespace in field is not quotation mark"); + } + dest.setEnd(dest.end() - 1); + } + + /** + * Process characters in "unquoted mode". This is easy: eat characters until the next field or line delimiter. + */ + private void processUnquotedMode(final CharSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { + startOffset = offset; + finishField(dest, lastInRow); + } + + /** + * Skip whitespace but do not consider the field delimiter to be whitespace. + */ + private void skipWhitespace() throws CsvReaderException { + while (true) { + if (offset == size) { + if (!tryEnsureMore()) { + return; + } + } + final char ch = buffer[offset]; + if (ch == fieldDelimiter || !RangeTests.isWhitespace(ch)) { + return; + } + ++offset; + } + } + + /** + * Eat characters until the next field or line delimiter. + * + * @param lastInRow An out parameter. Its contents are set to true if the cell was the last one in the row. + * Otherwise, its contents are set to false. + */ + private void finishField(final CharSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { + while (true) { + if (offset == size) { + if (!tryEnsureMore()) { + finish(dest); + // End of file sets last in row. + lastInRow.setValue(true); + return; + } + } + final char ch = buffer[offset]; + if (ch == fieldDelimiter) { + finish(dest); + ++offset; // ... and skip over the field delimiter. + lastInRow.setValue(false); + return; + } + if (ch == '\n') { + finish(dest); + ++offset; + lastInRow.setValue(true); + return; + } + if (ch == '\r') { + finish(dest); + ++offset; + if (tryEnsureMore()) { + // might be \r\n + if (buffer[offset] == '\n') { + ++offset; + } + } + lastInRow.setValue(true); + return; + } + ++offset; + } + } + + /** + * @return true if there are more characters. + */ + private boolean tryEnsureMore() throws CsvReaderException { + if (offset != size) { + return true; + } + spillRange(); + refillBuffer(); + return size != 0; + } + + /** + * Spill the current range to the spillBuffer. Normally we try to stay in the "common case", where the entire cell + * we are reading is consecutive characters in the underlying input buffer. 
+     * This assumption fails when either there are escaped quotes (like "" needing to be interpreted as "), or when
+     * the cell we are reading spans the boundaries of two input buffers. In that case we "spill" the characters we
+     * have collected so far to the spillBuffer.
+     */
+    private void spillRange() {
+        spillBuffer.append(buffer, startOffset, offset - startOffset);
+        startOffset = offset;
+    }
+
+    /**
+     * Get another chunk of data from the Reader.
+     */
+    private void refillBuffer() throws CsvReaderException {
+        offset = 0;
+        startOffset = 0;
+        try {
+            while (true) {
+                final int charsRead = reader.read(buffer);
+                if (charsRead < 0) {
+                    size = 0;
+                    return;
+                }
+                if (charsRead > 0) {
+                    size = charsRead;
+                    return;
+                }
+                // Zero-length reads are not expected to happen, but if they do, just keep trying.
+            }
+        } catch (IOException inner) {
+            throw new CsvReaderException("Caught exception", inner);
+        }
+    }
+
+    private void finish(final CharSlice dest) {
+        if (spillBuffer.size() == 0) {
+            // If we never spilled then our whole output is in the input buffer. So we can
+            // just return a slice of the input buffer.
+            dest.reset(buffer, startOffset, offset);
+            return;
+        }
+        // Otherwise, we need to append whatever residual is left to spillBuffer
+        // and return a slice of spillBuffer.
+        spillRange();
+        dest.reset(spillBuffer.data(), 0, spillBuffer.size());
+    }
+
+    /**
+     * Trim whitespace from the front and back of the slice.
+     *
+     * @param cs The slice, modified in-place to have whitespace (if any) removed.
+     */
+    private static void trimWhitespace(final CharSlice cs) {
+        final char[] data = cs.data();
+        int begin = cs.begin();
+        int end = cs.end();
+        while (begin != end && RangeTests.isWhitespace(data[begin])) {
+            ++begin;
+        }
+        while (begin != end && RangeTests.isWhitespace(data[end - 1])) {
+            --end;
+        }
+        cs.reset(data, begin, end);
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java
new file mode 100644
index 00000000000..d7d3d439e39
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -0,0 +1,558 @@
+package io.deephaven.csv.reading;
+
+import io.deephaven.csv.densestorage.DenseStorageReader;
+import io.deephaven.csv.densestorage.DenseStorageWriter;
+import io.deephaven.csv.parsers.Parser;
+import io.deephaven.csv.sinks.Sink;
+import io.deephaven.csv.parsers.context.SentinelConfiguration;
+import io.deephaven.csv.parsers.Parsers;
+import io.deephaven.csv.sinks.SinkFactory;
+import io.deephaven.csv.containers.CharSlice;
+import io.deephaven.csv.tokenization.Tokenizer;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.TimeLogger;
+import org.apache.commons.lang3.mutable.MutableBoolean;
+import org.apache.commons.lang3.mutable.MutableObject;
+
+import java.io.Reader;
+import java.util.*;
+import java.util.concurrent.*;
+
+/**
+ * A class for reading CSV data. Typical usage is:
+ *
+ * <ol>
+ * <li>Construct a CsvReader.</li>
+ * <li>Customize the CsvReader by calling the various setXXX methods.</li>
+ * <li>Arrange for the input text to be in a {@link Reader}.</li>
+ * <li>Prepare a {@link SinkFactory} which can in turn provide Sink<T> objects for the output data.</li>
+ * <li>Call the {@link #read} method.</li>
+ * </ol>
+ * Furthermore, the setXXX methods can be used in a builder pattern. Example:
+ *
+ * <pre>
+ * final CsvReader csvr = new CsvReader()
+ *         .setQuoteChar('#')
+ *         .setConcurrent(false)
+ *         .setParserFor("Timestamp", Parsers.DATETIME);
+ * final Reader r = ...;
+ * final SinkFactory f = ...;
+ * final CsvReader.Result res = csvr.read(r, f);
+ * </pre>
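+ *
+ * By default the reader tokenizes the input and parses the individual columns concurrently on multiple threads; see
+ * {@link #setConcurrent} to disable this.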
+ */
+public final class CsvReader {
+    /**
+     * Whether to trim leading and trailing blanks from non-quoted values.
+     */
+    private boolean ignoreSurroundingSpaces = false;
+    /**
+     * Whether to trim leading and trailing blanks from inside quoted values.
+     */
+    private boolean trim = false;
+    /**
+     * Whether the incoming data has column headers.
+     */
+    private boolean hasHeaders = true;
+    /**
+     * The quote character (used when you want field or line delimiters to be interpreted as literal text). For
+     * example:
+     *
+     * <pre>
+     * 123,"hello, there",456,
+     * </pre>
+     *
+     * Would be read as the three fields:
+     * <ul>
+     * <li>123</li>
+     * <li>hello, there</li>
+     * <li>456</li>
+     * </ul>
+     */
+    private char quoteChar = '"';
+    /**
+     * The field delimiter (the character that separates one column from the next).
+     */
+    private char fieldDelimiter = ',';
+    /**
+     * Whether to run concurrently. In particular, the operation of reading the raw file, breaking it into columns,
+     * and storing that column text in memory can run in parallel with parsing the data for a given column, and all
+     * the column data parsers can themselves run in parallel.
+     */
+    private boolean concurrent = true;
+    /**
+     * The user-defined set of parsers that participate in type inference. Defaults to Parsers.DEFAULT.
+     */
+    private List<Parser<?>> parsers = new ArrayList<>(Parsers.DEFAULT);
+
+    /**
+     * Client-specified headers that can be used to override the existing headers in the input (if hasHeaders is
+     * true), or to provide absent headers (if hasHeaders is false).
+     */
+    private List<String> clientSpecifiedHeaders = new ArrayList<>();
+    /**
+     * Used to force a specific parser for a specific column, specified by column name.
+     */
+    private final Map<String, Parser<?>> parsersByColumnName = new HashMap<>();
+    /**
+     * Used to force a specific parser for a specific column, specified by column number. Column numbers start
+     * with 1.
+     */
+    private final Map<Integer, Parser<?>> parsersByColumnNumber = new HashMap<>();
+    /**
+     * The default string that means "null value" in the input. It is used if not overridden on a per-column basis.
+     * It defaults to the empty string.
+     */
+    private String nullValueLiteral = "";
+    /**
+     * Used to force a specific null value literal for a specific column, specified by column name.
+     */
+    private final Map<String, String> nullValueLiteralByColumnName = new HashMap<>();
+    /**
+     * Used to force a specific null value literal for a specific column, specified by column number. Column numbers
+     * start with 1.
+     */
+    private final Map<Integer, String> nullValueLiteralByColumnNumber = new HashMap<>();
+    /**
+     * The parser to be used when a column is entirely null (unless a specific parser has been forced by setting an
+     * entry in the parsers collection).
+     */
+    private Parser<?> nullParser;
+    /**
+     * An optional low-level parser that understands custom time zones.
+     */
+    private Tokenizer.CustomTimeZoneParser customTimeZoneParser;
+    /**
+     * The SentinelConfiguration holding various optional null sentinels.
+     */
+    private final SentinelConfiguration sentinelConfiguration = new SentinelConfiguration();
+
+    /**
+     * Read the data.
+     *
+     * @param reader The input data
+     * @param sinkFactory A factory that can provide Sink<T> of all appropriate types for the output data. Once the
+     *        CsvReader determines what the column type is, it will use the SinkFactory to create an appropriate
+     *        Sink<T> for the type. Note that the CsvReader might guess wrong, so it might create a Sink, partially
+     *        populate it, and then abandon it. The final set of fully-populated Sinks will be returned in the
+     *        CsvReader.Result.
+     * @return A CsvReader.Result containing the column names, the number of columns, and the final set of
+     *         fully-populated Sinks.
+     */
+    public Result read(final Reader reader, final SinkFactory sinkFactory) throws CsvReaderException {
+        final CellGrabber grabber = new CellGrabber(reader, quoteChar, fieldDelimiter, ignoreSurroundingSpaces, trim);
+        // For an "out" parameter
+        final MutableObject<String[]> firstDataRowHolder = new MutableObject<>();
+        final String[] headersToUse = determineHeadersToUse(grabber, firstDataRowHolder);
+        final String[] firstDataRow = firstDataRowHolder.getValue();
+        final int numCols = headersToUse.length;
+
+        // Create a DenseStorageWriter and two readers for each column.
+        final DenseStorageWriter[] dsws = new DenseStorageWriter[numCols];
+        final DenseStorageReader[] dsr0s = new DenseStorageReader[numCols];
+        final DenseStorageReader[] dsr1s = new DenseStorageReader[numCols];
+        for (int ii = 0; ii < numCols; ++ii) {
+            final DenseStorageWriter dsw = new DenseStorageWriter();
+            dsws[ii] = dsw;
+            dsr0s[ii] = dsw.newReader();
+            dsr1s[ii] = dsw.newReader();
+        }
+
+        // Select an Executor based on whether the user wants the code to run concurrently or not.
+        final ExecutorService exec =
+                concurrent ? Executors.newFixedThreadPool(numCols + 1) : Executors.newSingleThreadExecutor();
+
+        final Future<Long> numRowsFuture = exec.submit(
+                () -> ParseInputToDenseStorage.doit(firstDataRow, grabber, dsws));
+
+        final ArrayList<Future<Sink<?>>> sinkFutures = new ArrayList<>();
+
+        for (int ii = 0; ii < numCols; ++ii) {
+            final List<Parser<?>> parsersToUse = calcParsersToUse(headersToUse[ii], ii + 1);
+            final String nullValueLiteralToUse = calcNullValueLiteralToUse(headersToUse[ii], ii + 1);
+
+            final int iiCopy = ii;
+            final Future<Sink<?>> fcb = exec.submit(
+                    () -> ParseDenseStorageToColumn.doit(dsr0s[iiCopy], dsr1s[iiCopy],
+                            parsersToUse, nullParser, customTimeZoneParser,
+                            sentinelConfiguration, nullValueLiteralToUse, sinkFactory));
+            sinkFutures.add(fcb);
+        }
+
+        final long numRows;
+        final Sink<?>[] sinks = new Sink[numCols];
+        try {
+            numRows = numRowsFuture.get();
+            for (int ii = 0; ii < numCols; ++ii) {
+                sinks[ii] = sinkFutures.get(ii).get();
+            }
+        } catch (Exception inner) {
+            throw new CsvReaderException("Caught exception", inner);
+        }
+
+        return new Result(numRows, headersToUse, sinks);
+    }
+
+    /**
+     * Determine which list of parsers to use for type inference. Returns {@link #parsers} unless the user has set
+     * an override on a column name or column number basis.
+     */
+    private List<Parser<?>> calcParsersToUse(final String columnName, final int oneBasedColumnNumber) {
+        Parser<?> specifiedParser = parsersByColumnName.get(columnName);
+        if (specifiedParser != null) {
+            return List.of(specifiedParser);
+        }
+        specifiedParser = parsersByColumnNumber.get(oneBasedColumnNumber);
+        if (specifiedParser != null) {
+            return List.of(specifiedParser);
+        }
+        return parsers;
+    }
+
+    /**
+     * Determine which null value literal to use. Returns {@link #nullValueLiteral} unless the user has set an
+     * override on a column name or column number basis.
+     */
+    private String calcNullValueLiteralToUse(final String columnName, final int oneBasedColumnNumber) {
+        String result = nullValueLiteralByColumnName.get(columnName);
+        if (result != null) {
+            return result;
+        }
+        result = nullValueLiteralByColumnNumber.get(oneBasedColumnNumber);
+        if (result != null) {
+            return result;
+        }
+        return nullValueLiteral;
+    }
+
+    /**
+     * Determine which headers to use. The result comes from either the first row of the file or the user-specified
+     * overrides.
+     */
+    private String[] determineHeadersToUse(final CellGrabber grabber,
+            final MutableObject<String[]> firstDataRowHolder) throws CsvReaderException {
+        String[] headersToUse = null;
+        final String[] firstDataRow;
+        if (hasHeaders) {
+            headersToUse = tryReadOneRow(grabber);
+            if (headersToUse == null) {
+                throw new CsvReaderException("Can't proceed because hasHeaders is set but input file is empty");
+            }
+        }
+
+        // Whether or not the input had headers, maybe override with client-specified headers.
+        if (clientSpecifiedHeaders.size() != 0) {
+            headersToUse = clientSpecifiedHeaders.toArray(new String[0]);
+        }
+
+        // If we still have nothing, try to generate synthetic column headers (works only if the file is non-empty,
+        // because we need to infer the column count).
+        if (headersToUse == null) {
+            firstDataRow = tryReadOneRow(grabber);
+            if (firstDataRow == null) {
+                throw new CsvReaderException(
+                        "Can't proceed because input file is empty and client has not specified headers");
+            }
+            headersToUse = new String[firstDataRow.length];
+            for (int ii = 0; ii < headersToUse.length; ++ii) {
+                headersToUse[ii] = "Column" + (ii + 1);
+            }
+        } else {
+            firstDataRow = null;
+        }
+        firstDataRowHolder.setValue(firstDataRow);
+        return headersToUse;
+    }
+
+    /**
+     * Try to read one row from the input. Returns null if the input ends before one row has been read.
+     * TODO(kosak): should probably deal more gracefully when the input doesn't end with a newline.
+     *
+     * @return The first row as a String[] or null if the input was exhausted.
+     */
+    private static String[] tryReadOneRow(final CellGrabber grabber) throws CsvReaderException {
+        final List<String> headers = new ArrayList<>();
+
+        // Grab the header
+        final CharSlice slice = new CharSlice();
+        final MutableBoolean lastInRow = new MutableBoolean();
+        do {
+            if (!grabber.grabNext(slice, lastInRow)) {
+                return null;
+            }
+            headers.add(slice.toString());
+        } while (!lastInRow.booleanValue());
+        return headers.toArray(new String[0]);
+    }
+
+    /**
+     * Sets whether to trim leading and trailing blanks from non-quoted values. This really only matters for columns
+     * that are inferred to be of type String. Numeric columns ignore surrounding whitespace regardless of this
+     * setting.
+     */
+    public CsvReader setIgnoreSurroundingSpaces(final boolean value) {
+        ignoreSurroundingSpaces = value;
+        return this;
+    }
+
+    /**
+     * Sets whether to trim leading and trailing blanks from inside quoted values. This really only matters for
+     * columns that are inferred to be of type String. Numeric columns ignore surrounding whitespace regardless of
+     * this setting.
+     */
+    public CsvReader setTrim(final boolean value) {
+        trim = value;
+        return this;
+    }
+
+    /**
+     * Sets whether the first row of the input is column headers.
+     */
+    public CsvReader setHasHeaders(final boolean value) {
+        hasHeaders = value;
+        return this;
+    }
+
+    /**
+     * Sets the field delimiter. Typically the comma or tab character.
+     */
+    public CsvReader setFieldDelimiter(final char value) {
+        fieldDelimiter = value;
+        return this;
+    }
+
+    /**
+     * Sets the quote character. Used by the input when it needs to escape special characters like field or line
+     * delimiters. A doubled quote character represents itself. Examples (assuming the quote character is set
+     * to '"'):
+     *
+    /**
+     * Sets the quote character. Used by the input when it needs to escape special characters like field or line
+     * delimiters. A doubled quote character represents itself. Examples (assuming the quote character is set to '"'):
+     * <ul>
+     * <li>"Hello, there": the string Hello, there</li>
+     * <li>"Hello""there": the string Hello"there</li>
+     * <li>"""": the string "</li>
+     * </ul>
+     */
+    public CsvReader setQuoteChar(final char value) {
+        quoteChar = value;
+        return this;
+    }
+
+    /**
+     * Whether the reader should run the file tokenizer and column parsing jobs concurrently, using multiple threads.
+     * This typically yields better performance.
+     */
+    public CsvReader setConcurrent(final boolean value) {
+        this.concurrent = value;
+        return this;
+    }
+
+    /**
+     * Set the list of parsers participating in type inference.
+     */
+    public CsvReader setParsers(final List<Parser<?>> parsers) {
+        this.parsers = new ArrayList<>(parsers);
+        return this;
+    }
+
+    /**
+     * Add parsers to the existing list of parsers participating in type inference.
+     */
+    public CsvReader addParsers(Parser<?>... parsers) {
+        this.parsers.addAll(List.of(parsers));
+        return this;
+    }
+
+    /**
+     * Overrides (if hasHeaders is true) or provides (if hasHeaders is false) the column headers.
+     */
+    public CsvReader setHeaders(final Collection<String> headers) {
+        clientSpecifiedHeaders = new ArrayList<>(headers);
+        return this;
+    }
+
+    /**
+     * Specify a parser for a given column name, rather than using inference to pick a type.
+     */
+    public CsvReader setParserFor(final String name, final Parser<?> parser) {
+        this.parsersByColumnName.put(name, parser);
+        return this;
+    }
+
+    /**
+     * Specify a parser for a given column number, rather than using inference to pick a type. The column numbers are
+     * 1-based.
+     */
+    public CsvReader setParserFor(final int columnNumber, final Parser<?> parser) {
+        this.parsersByColumnNumber.put(columnNumber, parser);
+        return this;
+    }
+
+    /**
+     * Specify the default null value literal to be used if not overridden for a column.
+     */
+    public CsvReader setNullValueLiteral(final String nullValueLiteral) {
+        this.nullValueLiteral = nullValueLiteral;
+        return this;
+    }
+
+    /**
+     * Specify the null value literal for a given column name.
+     */
+    public CsvReader setNullValueLiteralFor(final String name, final String nullValueLiteral) {
+        this.nullValueLiteralByColumnName.put(name, nullValueLiteral);
+        return this;
+    }
+
+    /**
+     * Specify the null value literal for a given column number. The column numbers are 1-based.
+     */
+    public CsvReader setNullValueLiteralFor(final int columnNumber, final String nullValueLiteral) {
+        this.nullValueLiteralByColumnNumber.put(columnNumber, nullValueLiteral);
+        return this;
+    }
+
+    /**
+     * Specify the parser to be used for columns that contain all nulls, unless that column has a parser specified by
+     * {@link #setParserFor}.
+     */
+    public CsvReader setNullParser(final Parser<?> nullParser) {
+        this.nullParser = nullParser;
+        return this;
+    }
+
+    /**
+     * Specify a plugin to be used to parse custom time zones. This permits the caller to support custom time zones such
+     * as the " NY" that appears in "2020-05-05 12:34:56 NY". The first character (here, space) must be something other
+     * than "Z".
+     */
+    public CsvReader setCustomTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser) {
+        this.customTimeZoneParser = customTimeZoneParser;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null booleans.
+     */
+    public CsvReader setNullBooleanAsByteValue(final byte nullValue) {
+        sentinelConfiguration.nullBooleanAsByteValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null bytes.
+     */
+    public CsvReader setNullByteValue(final byte nullValue) {
+        sentinelConfiguration.nullByteValue = nullValue;
+        return this;
+    }
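Editorial aside, not part of the patch: the parser-list setters above compose. Parsers.INT, Parsers.LONG, and Parsers.STRING are assumed constants (alongside the Parsers.DOUBLE and Parsers.DEFAULT referenced elsewhere in this PR), and IpAddressParser is a hypothetical custom Parser implementation.

// Editorial sketch, not part of the patch: narrow inference to a few standard types,
// then append a user-defined parser. Per the reordering rules in
// ParseDenseStorageToColumn.doit, standard parsers keep their Parsers.PRECEDENCE
// order and custom parsers are tried last, in the order they were added.
final CsvReader reader = new CsvReader()
        .setParsers(List.of(Parsers.INT, Parsers.LONG, Parsers.DOUBLE, Parsers.STRING))
        .addParsers(new IpAddressParser()); // hypothetical Parser implementation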
+    /**
+     * Specify a sentinel value used to represent null shorts.
+     */
+    public CsvReader setNullShortValue(final short nullValue) {
+        sentinelConfiguration.nullShortValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null ints.
+     */
+    public CsvReader setNullIntValue(final int nullValue) {
+        sentinelConfiguration.nullIntValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null longs.
+     */
+    public CsvReader setNullLongValue(final long nullValue) {
+        sentinelConfiguration.nullLongValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null floats.
+     */
+    public CsvReader setNullFloatValue(final float nullValue) {
+        sentinelConfiguration.nullFloatValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null doubles.
+     */
+    public CsvReader setNullDoubleValue(final double nullValue) {
+        sentinelConfiguration.nullDoubleValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null chars.
+     */
+    public CsvReader setNullCharValue(final char nullValue) {
+        sentinelConfiguration.nullCharValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null strings.
+     */
+    public CsvReader setNullStringValue(final String nullValue) {
+        sentinelConfiguration.nullStringValue = new MutableObject<>(nullValue);
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null DateTimes.
+     */
+    public CsvReader setNullDateTimeAsLongValue(final long nullValue) {
+        sentinelConfiguration.nullDateTimeAsLongValue = nullValue;
+        return this;
+    }
+
+    /**
+     * Specify a sentinel value used to represent null Timestamps.
+     */
+    public CsvReader setNullTimestampAsLongValue(final long nullValue) {
+        sentinelConfiguration.nullTimestampAsLongValue = nullValue;
+        return this;
+    }
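Editorial aside, not part of the patch: sentinel values matter when the caller's sinks are primitive arrays, which cannot hold a real null.

// Editorial sketch, not part of the patch: null cells surface at the Sink as the
// chosen sentinel values. Constructor usage is assumed, as above.
final CsvReader reader = new CsvReader()
        .setNullIntValue(Integer.MIN_VALUE) // null int cells arrive at the Sink as MIN_VALUE
        .setNullDoubleValue(Double.NaN)     // null double cells arrive as NaN
        .setNullStringValue("(null)");      // null String cells arrive as the literal "(null)"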
+    /**
+     * Result of {@link #read}.
+     */
+    public static final class Result {
+        private final long numRows;
+        private final String[] columnNames;
+        private final Sink<?>[] columns;
+
+        public Result(final long numRows, final String[] columnNames, final Sink<?>[] columns) {
+            this.numRows = numRows;
+            this.columnNames = columnNames;
+            this.columns = columns;
+        }
+
+        /**
+         * Number of rows in the input.
+         */
+        public long numRows() {
+            return numRows;
+        }
+
+        /**
+         * The column names.
+         */
+        public String[] columnNames() {
+            return columnNames;
+        }
+
+        /**
+         * Data for each column. Each Sink was constructed by some method in the SinkFactory that the caller passed to
+         * {@link #read}.
+         */
+        public Sink<?>[] columns() {
+            return columns;
+        }
+
+        /**
+         * The number of columns.
+         */
+        public int numCols() {
+            return columns.length;
+        }
+    }
+}
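Editorial aside, not part of the patch: the concrete Sink types inside a Result are whatever the caller's SinkFactory produced, so recovering typed data is a caller-side cast. MyLongSink here is hypothetical.

// Editorial sketch, not part of the patch. MyLongSink is a hypothetical
// caller-defined Sink<long[]> that keeps a growable long[] and exposes it.
static long[] longColumn(final CsvReader.Result result, final int colIndex) {
    final MyLongSink sink = (MyLongSink) result.columns()[colIndex];
    return sink.data(); // data() belongs to the hypothetical MyLongSink, not to Sink
}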
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java
new file mode 100644
index 00000000000..688d811b1c9
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java
@@ -0,0 +1,159 @@
+package io.deephaven.csv.reading;
+
+import io.deephaven.csv.parsers.IteratorHolder;
+import io.deephaven.csv.parsers.Parser;
+import io.deephaven.csv.parsers.Parsers;
+import io.deephaven.csv.parsers.context.SentinelConfiguration;
+import io.deephaven.csv.densestorage.DenseStorageReader;
+import io.deephaven.csv.sinks.Sink;
+import io.deephaven.csv.sinks.SinkFactory;
+import io.deephaven.csv.tokenization.Tokenizer;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.Renderer;
+import io.deephaven.csv.parsers.context.ParseContext;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+public final class ParseDenseStorageToColumn {
+    /**
+     * @param dsr A reader for the input.
+     * @param dsrAlt A second reader for the same input (used to perform the second pass over the data).
+     * @param parsers The set of parsers to try.
+     * @param nullParser The Parser to use if the column contains all nulls.
+     * @param customTimeZoneParser An optional plugin used to parse custom time zone text.
+     * @param sentinelConfiguration The configuration for optional sentinel values.
+     * @param nullValueLiteral The cell text that represents the null value. Typically configured to be the empty
+     *        string.
+     * @param sinkFactory Factory that makes all of the Sinks of various types, used to consume the data we produce.
+     * @return The {@link Sink}, provided by the caller's {@link SinkFactory}, that was selected to hold the column
+     *         data.
+     */
+    public static Sink<?> doit(final DenseStorageReader dsr, final DenseStorageReader dsrAlt,
+            final List<Parser<?>> parsers, final Parser<?> nullParser,
+            final Tokenizer.CustomTimeZoneParser customTimeZoneParser,
+            final SentinelConfiguration sentinelConfiguration, final String nullValueLiteral,
+            final SinkFactory sinkFactory) throws CsvReaderException {
+        List<Parser<?>> parsersToTry;
+        if (parsers == null) {
+            parsersToTry = Parsers.DEFAULT;
+        } else {
+            // The user has specified the parsers to use, which will be some mix of standard parsers
+            // and user-defined parsers. 'parsersToTry' will be a reordering of parsers such that:
+            // 1. All the specified standard parsers come before any custom parser.
+            // 2. The standard parsers will be sorted by the order they appear in Parsers.PRECEDENCE.
+            // 3. The custom parsers will be in the same order as provided by the user.
+            final Set<Parser<?>> requestedSet = new HashSet<>(parsers);
+            parsersToTry = Parsers.PRECEDENCE.stream().filter(requestedSet::contains).collect(Collectors.toList());
+
+            final Set<Parser<?>> standardSet = new HashSet<>(Parsers.PRECEDENCE);
+            for (Parser<?> parser : parsers) {
+                if (!standardSet.contains(parser)) {
+                    parsersToTry.add(parser);
+                }
+            }
+        }
+
+        // This is a hack that lets the float parser know whether there is another parser
+        // available (namely, the double parser) that can handle more significant digits
+        // than it can.
+        final boolean doubleParserIsAvailable = parsersToTry.contains(Parsers.DOUBLE);
+
+        final Tokenizer tokenizer = new Tokenizer(customTimeZoneParser);
+        final ParseContext ctx = new ParseContext(tokenizer, sinkFactory,
+                sentinelConfiguration, nullValueLiteral,
+                doubleParserIsAvailable);
+
+        // Skip null cells. Nulls are supported but they cannot help us with type inference.
+        final IteratorHolder ih = new IteratorHolder(dsr);
+        boolean columnIsAllNulls = true;
+        while (ih.tryMoveNext()) {
+            if (!ctx.isNullCell(ih)) {
+                columnIsAllNulls = false;
+                break;
+            }
+        }
+
+        if (columnIsAllNulls && parsersToTry.size() != 1) {
+            if (nullParser == null) {
+                throw new CsvReaderException(
+                        "Column contains all null cells: can't infer type of column, and nullParser is not set.");
+            }
+            parsersToTry = List.of(nullParser);
+        }
+
+        if (parsersToTry.size() == 0) {
+            throw new CsvReaderException("No parsers available to try.");
+        }
+
+        final int numParsers = parsersToTry.size();
+        for (int ii = 0; ii < numParsers; ++ii) {
+            final Parser<?> parser = parsersToTry.get(ii);
+            final boolean lastResort = ii == numParsers - 1;
+            final Sink<?> result = tryTwoPhaseParse(parser, lastResort, ctx, ih, dsrAlt);
+            if (result != null) {
+                return result;
+            }
+        }
+
+        // Can't get here, because tryTwoPhaseParse would have been called with lastResort = true,
+        // and the exception would have been thrown there.
+        throw new CsvReaderException("Logic error: unreachable code.");
+    }
+
+    private static Sink<?> tryTwoPhaseParse(final Parser<?> parser, final boolean lastResort,
+            final ParseContext ctx, final IteratorHolder ih,
+            final DenseStorageReader dsrAlt) throws CsvReaderException {
+        // Before invoking us, our caller has advanced 'ih' past all null entries.
+        if (ih.isExhausted()) {
+            // The input contains all nulls (or is empty). Make a new IteratorHolder for the second pass
+            // of null parsing.
+            final IteratorHolder ihAlt = new IteratorHolder(dsrAlt);
+            ihAlt.tryMoveNext(); // Input could be empty, so success or failure is ignored here.
+            final Sink<?> result = parser.tryParse(ctx, ihAlt, null, 0, ih.numConsumed());
+            if (result == null) {
+                final String message =
+                        "Parser failed on all-null column. Parser was: " + parser.getClass().getCanonicalName();
+                throw new CsvReaderException(message);
+            }
+            return result;
+        }
+
+        // If this is the last (or only) parser, then there's no need to do two phases. We can potentially
+        // be more memory efficient by forgetting the first iteration and just doing the second.
+        // The reason we bother coding up this special case is that we are chasing the writer,
+        // and the sooner we get rid of values at the head of the linked list, the better off we are,
+        // in terms of memory consumption.
+        if (lastResort) {
+            final IteratorHolder ihAlt = new IteratorHolder(dsrAlt);
+            ihAlt.tryMoveNext(); // Input could be empty, so success or failure is ignored here.
+            final Sink<?> result = parser.tryParse(ctx, ihAlt, null, 0, Long.MAX_VALUE);
+            if (result == null) {
+                final String message =
+                        "Parse failed on last available parser. Parser was: " + parser.getClass().getCanonicalName();
+                throw new CsvReaderException(message);
+            }
+            return result;
+        }
+
+        // Otherwise do a two-phase parse. If successful, the first phase will process the elements
+        // [startRow, end).
+        final long startRow = ih.numConsumed() - 1;
+        Sink<?> result = parser.tryParse(ctx, ih, null, startRow, Long.MAX_VALUE);
+        if (result == null) {
+            // This parser failed, but there are more. Signal failure to the caller, who will try the next one.
+            return null;
+        }
+
+        // Parse succeeded to the end. Now do phase 2, with a new iterator, and passing in the same Sink
+        // that was created in phase 1. This phase will process the elements [0, startRow).
+ final IteratorHolder ihAlt = new IteratorHolder(dsrAlt); + ihAlt.tryMoveNext(); // Input is not empty, so we know this will succeed. + result = parser.tryParse(ctx, ihAlt, result, 0, startRow); + if (result == null) { + final String message = "Parse failed on second pass through the input. Parser was: " + + parser.getClass().getCanonicalName(); + throw new CsvReaderException(message); + } + return result; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java new file mode 100644 index 00000000000..81889e6ab40 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java @@ -0,0 +1,86 @@ +package io.deephaven.csv.reading; + +import io.deephaven.csv.densestorage.DenseStorageWriter; +import io.deephaven.csv.containers.CharSlice; +import io.deephaven.csv.util.CsvReaderException; +import org.apache.commons.lang3.mutable.MutableBoolean; + +/** + * The job of this class is to take the input text, parse the CSV format to break it into cells, and to feed those cells + * into N DenseStorageWriters (where N is the number of columns). Concurrent readers will be taking the data out of + * DenseStorageReaders, possibly doing type inference on it, and then parsing the data into the final form. + */ +public class ParseInputToDenseStorage { + public static long doit(String[] optionalFirstDataRow, CellGrabber grabber, DenseStorageWriter[] dsws) + throws CsvReaderException { + final CharSlice slice = new CharSlice(); + final int numCols = dsws.length; + + // Zero-based row number. + long rowNum = 0; + // There is a case (namely when the file has no headers and the client hasn't specified + // them either) where the CsvReader was forced to read the first row of data from the file + // in order to determine the number of columns. If this happened, optionalFirstRow will + // be non-null and we can process it as data here. Then the rest of the processing can + // proceed as normal. + if (optionalFirstDataRow != null) { + if (optionalFirstDataRow.length != numCols) { + throw new CsvReaderException(String.format("Expected %d columns but optionalFirstRow had %d", + numCols, optionalFirstDataRow.length)); + } + for (int ii = 0; ii < optionalFirstDataRow.length; ++ii) { + final char[] temp = optionalFirstDataRow[ii].toCharArray(); + slice.reset(temp, 0, temp.length); + dsws[ii].append(slice); + } + ++rowNum; + } + + // Grab the remaining lines and store them. + // The outer while is the "row" iteration. + final MutableBoolean lastInRow = new MutableBoolean(); + OUTER: while (true) { + // Zero-based column number. + int colNum = 0; + + try { + // The inner while is the "column" iteration + while (true) { + if (!grabber.grabNext(slice, lastInRow)) { + if (colNum == 0) { + break OUTER; + } + // Can't get here. If there is any data at all in the last row, and *then* the file ends, + // grabNext() will return true, with lastInRow set. 
+ throw new CsvReaderException("Logic error: uncaught short last row"); + } + final DenseStorageWriter dsw = dsws[colNum]; + dsw.append(slice); + ++colNum; + if (colNum == numCols) { + if (!lastInRow.booleanValue()) { + throw new CsvReaderException( + String.format("Row %d has too many columns (expected %d)", rowNum + 1, numCols)); + } + break; + } + if (lastInRow.booleanValue()) { + throw new CsvReaderException( + String.format("Row %d has too few columns (expected %d, have %d)", rowNum + 1, numCols, + colNum)); + } + } + } catch (Exception e) { + final String message = String.format("While processing row %d, column %d:", + rowNum + 1, colNum + 1); + throw new CsvReaderException(message, e); + } + ++rowNum; + } + for (DenseStorageWriter dsw : dsws) { + dsw.finish(); + } + + return rowNum; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java b/extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java new file mode 100644 index 00000000000..6bc7a33d96a --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java @@ -0,0 +1,5 @@ +package io.deephaven.csv.sinks; + +public interface Sink { + void write(final TARRAY src, final int srcOffset, final long destOffset, final int size, boolean appending); +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java b/extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java new file mode 100644 index 00000000000..65b9c390021 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java @@ -0,0 +1,66 @@ +package io.deephaven.csv.sinks; + +import io.deephaven.csv.CsvSpecs; + +/** + * An interface which allows the CsvReader to write to opaque data types that it is unaware of. Using this interface, + * the caller provides factory methods that make a Sink<T> for the corresponding CSV type. Note that, due to the + * inference process, the CsvReader might make a Sink and then abandon it if it no longer suits the needs of the + * CsvReader. Put another way, the final set of Sinks returned might be a subset of the set of Sinks created. + */ +public interface SinkFactory { + /** + * Make a Sink for the boolean (as byte) representation. + */ + Sink makeBooleanAsByteSink(); + + /** + * Make a Sink for the byte representation. + */ + Sink makeByteSink(); + + /** + * Make a Sink for the short representation. + */ + Sink makeShortSink(); + + /** + * Make a Sink for the int representation. + */ + Sink makeIntSink(); + + /** + * Make a Sink for the long representation. + */ + Sink makeLongSink(); + + /** + * Make a Sink for the float representation. + */ + Sink makeFloatSink(); + + /** + * Make a Sink for the double representation. + */ + Sink makeDoubleSink(); + + /** + * Make a Sink for the char representation. + */ + Sink makeCharSink(); + + /** + * Make a Sink for the String representation. + */ + Sink makeStringSink(); + + /** + * Make a Sink for the DateTime (as long) representation. + */ + Sink makeDateTimeAsLongSink(); + + /** + * Make a Sink for the Timestamp (as long) representation. 
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java
new file mode 100644
index 00000000000..0000f7febc9
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java
@@ -0,0 +1,159 @@
+package io.deephaven.csv.tokenization;
+
+/**
+ * Simple range tests that may be faster than the corresponding Java utilities because they are ASCII-specific.
+ */
+public class RangeTests {
+    /**
+     * If the character is lowercase ASCII, converts it to uppercase ASCII. Otherwise leaves it alone.
+     *
+     * @param ch The character.
+     * @return The converted or unchanged character.
+     */
+    public static char toUpper(char ch) {
+        return isLower(ch) ? (char) (ch - 'a' + 'A') : ch;
+    }
+
+    /**
+     * Is the character uppercase ASCII?
+     *
+     * @param ch The character.
+     * @return True if the character is uppercase ASCII. False otherwise.
+     */
+    public static boolean isUpper(char ch) {
+        return ch >= 'A' && ch <= 'Z';
+    }
+
+    /**
+     * Is the character lowercase ASCII?
+     *
+     * @param ch The character.
+     * @return True if the character is lowercase ASCII. False otherwise.
+     */
+    public static boolean isLower(char ch) {
+        return ch >= 'a' && ch <= 'z';
+    }
+
+    /**
+     * Is the character an ASCII digit?
+     *
+     * @param ch The character.
+     * @return True if the character is an ASCII digit. False otherwise.
+     */
+    public static boolean isDigit(char ch) {
+        return ch >= '0' && ch <= '9';
+    }
+
+    /**
+     * Is the character ASCII whitespace?
+     *
+     * @param ch The character.
+     * @return True if the character is ASCII whitespace. False otherwise.
+     */
+    public static boolean isWhitespace(char ch) {
+        // TODO(kosak)
+        return ch == ' ';
+    }
+
+    /**
+     * Is the value in range for a Java byte?
+     *
+     * @param value The value.
+     * @return True if the value is in range for a Java byte. False otherwise.
+     */
+    public static boolean isInRangeForByte(long value) {
+        return value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE;
+    }
+
+    /**
+     * Is the value in range for a Java short?
+     *
+     * @param value The value.
+     * @return True if the value is in range for a Java short. False otherwise.
+     */
+    public static boolean isInRangeForShort(long value) {
+        return value >= Short.MIN_VALUE && value <= Short.MAX_VALUE;
+    }
+
+    /**
+     * Is the value in range for a Java int?
+     *
+     * @param value The value.
+     * @return True if the value is in range for a Java int. False otherwise.
+     */
+    public static boolean isInRangeForInt(long value) {
+        return value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE;
+    }
+
+    /**
+     * Is the value in range for a Java float?
+     *
+     * @param value The value.
+     * @return True if the value is in range for a Java float. False otherwise.
+     */
+    public static boolean isInRangeForFloat(double value) {
+        return Double.isNaN(value) ||
+                Double.isInfinite(value) ||
+                (value >= -Float.MAX_VALUE && value <= Float.MAX_VALUE);
+    }
+
+    /**
+     * Determines if the input has fewer than 8 significant figures. Used to estimate whether the number can fit in a
+     * float without losing precision.
+     *
+     * @param data The character data.
+     * @param begin The inclusive start of the slice.
+     * @param end The exclusive end of the slice.
+     * @return True if the input has fewer than 8 significant figures. Otherwise, false. Can degenerately return true
+     *         if the input is empty or does not contain digits at all.
+ */ + public static boolean hasFewerThan8SignificantFigures(final byte[] data, final int begin, final int end) { + int current = begin; + + // Find first digit + while (true) { + if (current == end) { + return true; + } + final char ch = (char) data[current++]; + if (isDigit(ch)) { + break; + } + } + + // Find last digit. Intervening decimal point is ok. + final int firstDigitPos = current; + int decimalPointAdjustment = 0; + while (current != end) { + final char ch = (char) data[current++]; + if (ch == '.') { + decimalPointAdjustment = 1; + continue; + } + if (!isDigit(ch)) { + break; + } + } + + final int numDigits = current - firstDigitPos - decimalPointAdjustment; + return numDigits < 8; + } + + /** + * Are all the characters in the array slice representable as bytes? + * + * @param data The character data. + * @param begin The inclusive start of the slice. + * @param end The exclusive end of the slice. + * @return True if all the characters are representable as bytes, false otherwise. + */ + public static boolean isByteRepresentable(final char[] data, final int begin, final int end) { + for (int cur = begin; cur != end; ++cur) { + final int ch = data[cur]; + if (ch > 0xff) { + return false; + } + } + return true; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java new file mode 100644 index 00000000000..64291878577 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java @@ -0,0 +1,580 @@ +package io.deephaven.csv.tokenization; + +import io.deephaven.csv.containers.ByteSlice; +import io.deephaven.csv.tokenization.external.FastDoubleParser; +import io.deephaven.csv.tokenization.external.FastDoubleParserFromByteArray; +import org.apache.commons.lang3.mutable.MutableBoolean; +import org.apache.commons.lang3.mutable.MutableDouble; +import org.apache.commons.lang3.mutable.MutableLong; +import org.apache.commons.lang3.mutable.MutableObject; + +import java.time.*; + +public class Tokenizer { + /** + * An optional custom time zone parser. Used for clients (such as Deephaven itself) who support custom time zone + * formats. + */ + private final CustomTimeZoneParser customTimeZoneParser; + /** + * Storage for a temporary "out" variable owned by tryParseDateTime. + */ + private final MutableLong dateTimeTemp0 = new MutableLong(); + /** + * Storage for a temporary "out" variable owned by tryParseDateTime. + */ + private final MutableLong dateTimeTemp1 = new MutableLong(); + /** + * Storage for a temporary "out" variable owned by tryParseDateTime. + */ + private final MutableLong dateTimeTemp2 = new MutableLong(); + /** + * Storage for a temporary "out" variable owned by tryParseDateTime. + */ + private final MutableObject dateTimeTempZoneId = new MutableObject<>(); + /** + * Storage for a temporary "out" variable owned by tryParseDateTime. + */ + private final MutableBoolean dateTimeTempBoolean = new MutableBoolean(); + + public Tokenizer(CustomTimeZoneParser customTimeZoneParser) { + this.customTimeZoneParser = customTimeZoneParser; + } + + /** + * Try to parse the input as a boolean. + * + * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. + * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. 
+ */ + public boolean tryParseBoolean(final ByteSlice bs, final MutableBoolean result) { + final int savedBegin = bs.begin(); + final int savedEnd = bs.end(); + Mutating.trim(bs); + // Successful if parse was successful AND input was completely consumed. + final boolean success = Mutating.tryParseBoolean(bs, result) && bs.begin() == bs.end(); + bs.setBegin(savedBegin); + bs.setEnd(savedEnd); + return success; + } + + /** + * Try to parse the input as a long. + * + * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. + * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + public boolean tryParseLong(final ByteSlice bs, final MutableLong result) { + final int savedBegin = bs.begin(); + final int savedEnd = bs.end(); + Mutating.trim(bs); + // Successful if parse was successful AND input was completely consumed. + final boolean success = Mutating.tryParseLong(bs, result) && bs.begin() == bs.end(); + bs.setBegin(savedBegin); + bs.setEnd(savedEnd); + return success; + } + + /** + * Try to parse the input as a double. + * + * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. + * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. + * @return true if {@code bs} was successfully parsed as a double. Otherwise, false. + */ + public boolean tryParseDouble(final ByteSlice bs, final MutableDouble result) { + // Our third-party double parser already checks for trailing garbage so we don't have to. + try { + final double res = FastDoubleParserFromByteArray.parseDouble(bs.data(), bs.begin(), bs.size()); + result.setValue(res); + return true; + } catch (NumberFormatException nfe) { + // Normally we would be pretty sad about throwing exceptions in the inner loops of our CSV parsing + // framework, + // but the fact of the matter is that the first exception thrown will cause the FloatParser and DoubleParser + // to punt to the next parser anyway, so the overall impact is negligible. + return false; + } + } + + /** + * Try to parse the input as a Deephaven DateTime value (represented as nanoseconds since the epoch). + * + * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. + * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. + * @return true if {@code bs} was successfully parsed as a Deephaven DateTime. Otherwise, false. + */ + public boolean tryParseDateTime(final ByteSlice bs, final MutableLong result) { + final int savedBegin = bs.begin(); + // Successful if parse was successful AND input was completely consumed. + final boolean success = Mutating.tryParseDateTime(bs, customTimeZoneParser, + dateTimeTemp0, dateTimeTemp1, dateTimeTemp2, dateTimeTempBoolean, dateTimeTempZoneId, result) && + bs.begin() == bs.end(); + bs.setBegin(savedBegin); + return success; + } + + /** + * The methods in this class obey the following invariants: On success, they update their incoming ByteSlice (to + * point to the end of the sequence) On failure, they leave it unchanged. + */ + private static final class Mutating { + /** + * Modify the input slice to remove leading and trailing whitespace, if any. + * + * @param bs Modified in place to remove leading and trailing whitespace, if any. 
+ */ + public static void trim(final ByteSlice bs) { + while (bs.begin() != bs.end() && RangeTests.isWhitespace((char) bs.front())) { + bs.setBegin(bs.begin() + 1); + } + while (bs.begin() != bs.end() && RangeTests.isWhitespace((char) bs.back())) { + bs.setEnd(bs.end() - 1); + } + } + + /** + * If the slice is nonempty and its first character is {@code ch}, then eat the first character. + * + * @param bs If the method returns true, the slice is updated to remove the first character. Otherwise the slice + * is unmodified. + * @return true If the character was eaten, false otherwise. + */ + private static boolean tryEatChar(final ByteSlice bs, final char ch) { + if (bs.begin() == bs.end() || bs.front() != ch) { + return false; + } + bs.setBegin(bs.begin() + 1); + return true; + } + + /** + * Parse (a prefix of) the input as a boolean. + * + * @param bs If the method returns true, the slice is updated to remove the characters comprising the result. + * Otherwise, the slice is unmodified. + * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + public static boolean tryParseBoolean(final ByteSlice bs, final MutableBoolean result) { + final byte[] d = bs.data(); + final int o = bs.begin(); + final int bSize = bs.size(); + + if (bSize == 4) { + result.setValue(true); // Optimistic. + bs.setBegin(bs.end()); // Optimistic + return (d[o] == 't' || d[o] == 'T') && + (d[o + 1] == 'r' || d[o + 1] == 'R') && + (d[o + 2] == 'u' || d[o + 2] == 'U') && + (d[o + 3] == 'e' || d[o + 3] == 'E'); + } + + if (bSize == 5) { + result.setValue(false); // Optimistic. + bs.setBegin(bs.end()); // Optimistic + // mega sad + return (d[o] == 'f' || d[o] == 'F') && + (d[o + 1] == 'a' || d[o + 1] == 'A') && + (d[o + 2] == 'l' || d[o + 2] == 'L') && + (d[o + 3] == 's' || d[o + 3] == 'S') && + (d[o + 4] == 'e' || d[o + 4] == 'E'); + } + + return false; + } + + /** + * Parse (a prefix of) the input as a long. + * + * @param bs If the method returns true, the slice is updated to remove the characters comprising the result. + * Otherwise, the slice is unmodified. + * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + public static boolean tryParseLong(final ByteSlice bs, final MutableLong result) { + final int savedBegin = bs.begin(); + if (bs.begin() == bs.end()) { + return false; + } + final char front = (char) bs.front(); + boolean negative = false; + if (front == '+') { + bs.setBegin(bs.begin() + 1); + } else if (front == '-') { + negative = true; + bs.setBegin(bs.begin() + 1); + } + if (!tryParseWholeNumber(bs, 1, 999, negative, result)) { + bs.setBegin(savedBegin); + return false; + } + return true; + } + + /** + * Parse (a prefix of) the input as a DateTime. Formats are largely ISO except we allow a pluggable timezone + * parser, used for example to support Deephaven-style time zones. + *

+         * <p>
+         * Allowable formats:
+         * <ul>
+         * <li>2021-11-07T09:00:00Z</li>
+         * <li>2021-11-07T09:00:00.1Z</li>
+         * <li>2021-11-07T09:00:00.12Z</li>
+         * <li>...</li>
+         * <li>2021-11-07T09:00:00.123456789Z</li>
+         * </ul>
+         *
+         * <p>
+         * Hyphens and colons are optional (all in or all out). The 'T' can also be a space. The Z above is either the
+         * literal Z, meaning UTC, or some other text. If this character is not Z, the method will call out to a
+         * user-configurable time zone parser (if one is configured) to see if the text can be parsed as a time zone.
+         * In Deephaven this is used to parse Deephaven time zones like " NY", " MN", " ET", " UTC", etc.
+         *
+         * <p>
+         * Allowable formats in UTC offset style (the sign can be + or -, and the offset can be hh, hh:mm, or hhmm):
+         * <ul>
+         * <li>2021-11-07T09:00:00+01</li>
+         * <li>2021-11-07T09:00:00.1-02:30</li>
+         * <li>2021-11-07T09:00:00.12+0300</li>
+         * <li>...</li>
+         * <li>2021-11-07T09:00:00.123456789+01:30</li>
+         * </ul>
    + * + * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. + * Otherwise (if the method returns false), the slice will be unchanged. + * @param temp0 A MutableLong for the method to use for temporary storage, so it doesn't have to allocate one. + * @param temp1 A MutableLong for the method to use for temporary storage, so it doesn't have to allocate one. + * @param temp2 A MutableLong for the method to use for temporary storage, so it doesn't have to allocate one. + * @param tempZoneId A MutableObject<ZoneId> for the method to use for temporary storage, so it doesn't + * have to allocate one. + * @param result The DateTime (in nanoseconds since the epoch) if the method returns true. Otherwise, the + * contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + private static boolean tryParseDateTime(final ByteSlice bs, final CustomTimeZoneParser customTimeZoneParser, + final MutableLong temp0, final MutableLong temp1, final MutableLong temp2, + final MutableBoolean tempBoolean, final MutableObject tempZoneId, + final MutableLong result) { + // The logic proceeds as follows. + // First we have the required fields: + // yyyy + // - (optional, but if absent then no later hyphens or colons) + // mm + // : (optional, but presence or absence of punctuation needs to be consistent. Also we can stop here). + // dd + // T or space (or we stop here) + // hh + // : (optional, but presence or absence of punctuation needs to be consistent. Also we can stop here). + // MM + // : (optional, but presence or absence of punctuation needs to be consistent. Also we can stop here). + // SS + // . or , (optional, introduces fraction, must be followed by 1-9 decimal digits). + // Z or space or + or - : + // Z means UTC + // space introduces a Deephaven time zone indicator, which must be one of the + // enumeration values in the class DBTimeZone + // + or - means an offset follows, which itself is + // hh + // : (optional) + // mm (optional) + + final int savedBegin = bs.begin(); + if (!tryParseYyyymmdd(bs, temp0, temp1, temp2, tempBoolean)) { + return false; + } + final int year = temp0.intValue(); + final int month = temp1.intValue(); + final int day = temp2.intValue(); + final boolean punctuationRequired = tempBoolean.booleanValue(); + + // Require 'T' or ' ' (per RFC 3339). + if (!tryEatChar(bs, 'T') && !tryEatChar(bs, ' ')) { + bs.setBegin(savedBegin); + return false; + } + + // Reusing result for temporary storage! + if (!tryParseHHmmssNanos(bs, punctuationRequired, temp0, temp1, temp2, result)) { + bs.setBegin(savedBegin); + return false; + } + final int hour = temp0.intValue(); + final int minute = temp1.intValue(); + final int second = temp2.intValue(); + final int nanos = result.intValue(); + + if (!tryParseIsoTimeZone(bs, tempZoneId, temp0) && + (customTimeZoneParser == null || !customTimeZoneParser.tryParse(bs, tempZoneId, temp0))) { + bs.setBegin(savedBegin); + return false; + } + final ZoneId zoneIdToUse = tempZoneId.getValue(); + final long secondsOffsetToUse = temp0.getValue(); + + final ZonedDateTime zdt = ZonedDateTime.of(year, month, day, hour, minute, second, 0, zoneIdToUse); + final long zdtSeconds = zdt.toEpochSecond(); + final long adjustedZdtSeconds = zdtSeconds + secondsOffsetToUse; + final long adjustedZdtNanos = adjustedZdtSeconds * 1_000_000_000L + nanos; + result.setValue(adjustedZdtNanos); + return true; + } + + /** + * Parse (a prefix of) the input as yyyyMMdd or yyyy-MM-dd. 
+ * + * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. + * Otherwise (if the method returns false), the slice will be unchanged. + * @param yyyy Contains the parsed year if this method returns true. Otherwise, the contents are unspecified. + * @param mm Contains the parsed month if this method returns true. Otherwise, the contents are unspecified. + * @param dd Contains the parsed day if this method returns true. Otherwise, the contents are unspecified. + * @param hasPunctuation Contains whether hyphens were found in the input if this method returns true. + * Otherwise, the contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + private static boolean tryParseYyyymmdd(final ByteSlice bs, final MutableLong yyyy, + final MutableLong mm, final MutableLong dd, final MutableBoolean hasPunctuation) { + final int savedBegin = bs.begin(); + if (!tryParseWholeNumber(bs, 4, 4, false, yyyy)) { + return false; + } + + hasPunctuation.setValue(Mutating.tryEatChar(bs, '-')); + + if (!tryParseWholeNumber(bs, 2, 2, false, mm)) { + bs.setBegin(savedBegin); + return false; + } + + if (hasPunctuation.booleanValue() && !tryEatChar(bs, '-')) { + bs.setBegin(savedBegin); + return false; + } + if (!tryParseWholeNumber(bs, 2, 2, false, dd)) { + bs.setBegin(savedBegin); + return false; + } + return true; + } + + /** + * Parse (a prefix of) the input as hhmmss.nnnnnn or hh:mm:ss.nnnn and various variants (minutes, seconds, and + * nanos are optional, and the nanos separator is either period or comma). + * + * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. + * Otherwise (if the method returns false), the slice will be unchanged. + * @param hours Contains the parsed hours if this method returns true. Otherwise, the contents are unspecified. + * @param minutes Contains the parsed minutes if this method returns true. Otherwise, the contents are + * unspecified. + * @param seconds Contains the parsed seconds if this method returns true. Otherwise, the contents are + * unspecified. + * @param nanos Contains the parsed nanos if this method returns true. Otherwise, the contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + private static boolean tryParseHHmmssNanos(final ByteSlice bs, final boolean punctuationRequired, + final MutableLong hours, + final MutableLong minutes, + final MutableLong seconds, final MutableLong nanos) { + final int savedBegin = bs.begin(); + // Hour + if (!tryParseWholeNumber(bs, 2, 2, false, hours)) { + return false; + } + // Set defaults for minutes, seconds, nanos, in case we exit early. + minutes.setValue(0); + seconds.setValue(0); + nanos.setValue(0); + + // Minutes, seconds, and nanos are optional. + + // If a colon is required but not present, then the parse is done (this is not an error). + if (punctuationRequired && !tryEatChar(bs, ':')) { + return true; + } + + // Try minutes + if (!tryParseWholeNumber(bs, 2, 2, false, minutes)) { + // Next thing is not a number. If we previously ingested a colon, not having a next number is an error. + // But if we did not ingest a colon, not having a number is ok. + // If we return false we are obligated to reset the slice. + minutes.setValue(0); // Sub-parse failed, but we still might return success. So this needs to be + // correct. 
+ final boolean success = !punctuationRequired; + if (!success) { + bs.setBegin(savedBegin); + } + return success; + } + + // If a colon is required but not present, then the parse is done (this is not an error). + if (punctuationRequired && !tryEatChar(bs, ':')) { + return true; + } + + // Try seconds. + if (!tryParseWholeNumber(bs, 2, 2, false, seconds)) { + // Next thing is not a number. If we previously ingested a colon, not having a next number is an error. + // But if we did not ingest a colon, not having a number is ok. + // If we return false we are obligated to reset the slice. + seconds.setValue(0); // Sub-parse failed, but we still might return success. So this needs to be + // correct. + final boolean success = !punctuationRequired; + if (!success) { + bs.setBegin(savedBegin); + } + return success; + } + + if (!tryEatChar(bs, '.') && !tryEatChar(bs, ',')) { + // Period (or comma!) introduces fraction. If not present, then stop the parse here (with a success + // indication) + return true; + } + + // Try nanoseconds + final int beginBeforeNs = bs.begin(); + if (!tryParseWholeNumber(bs, 1, 9, false, nanos)) { + // If you couldn't get a number, that's a parse fail. + bs.setBegin(savedBegin); + return false; + } + + // Pad to the right with zeroes (that is, in "blah.12", the .12 is 120,000,000 nanos. + final int length = bs.begin() - beginBeforeNs; + for (int ii = length; ii < 9; ++ii) { + nanos.setValue(10 * nanos.getValue()); + } + return true; + } + + /** + * Try to parse (a prefix of) the input as a whole number. + * + * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. + * Otherwise (if the method returns false), the slice will be unchanged. + * @param minSize The parsed number must be at least this size. Otherwise, we will return false. + * @param maxSize The parsed number must be at most this size. We will stop the parse after this size, even if + * the parse could continue (e.g. even if a digit immediately follows). + * @param negate If we should negate the parsed number on the way out. + * @param result Contains the parsed whole number if this method returns true. Otherwise, the contents are + * unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + private static boolean tryParseWholeNumber(final ByteSlice bs, final int minSize, final int maxSize, + final boolean negate, final MutableLong result) { + final byte[] data = bs.data(); + final int begin = bs.begin(); + final int end = bs.end(); + final int size = bs.size(); + if (size < minSize) { + return false; + } + final int endToUse = Math.min(end, begin + maxSize); + long res = 0; + long prevRes = 0; + int current = begin; + // We build the number using negative values, because the negative range is slightly longer and this helps + // us when we happen to parse Long.MIN_VALUE. + for (; current < endToUse; ++current) { + final char ch = (char) data[current]; + if (!RangeTests.isDigit(ch)) { + break; + } + res = res * 10 - (ch - '0'); + if (res > prevRes) { + // Overflow. + return false; + } + prevRes = res; + } + if (current == begin) { + return false; + } + // Caller wanted a positive number, but we operate in a negative number system + if (!negate) { + if (res == Long.MIN_VALUE) { + // Can't represent the negation of Long.MIN_VALUE. + return false; + } + res = -res; + } + result.setValue(res); + bs.setBegin(current); + return true; + } + + /** + * Try to parse (a prefix of) the input as an ISO time zone. 
For convenience/efficiency, the method is allowed + * to return either a ZoneOffset or a numerical offset in seconds (or both). + * + * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. + * Otherwise (if the method returns false), the slice will be unchanged. + * @param zoneId Contains the parsed time zone if this method returns true. Otherwise, the contents are + * unspecified. + * @param offsetSeconds Contains a time zone offset in seconds if this method returns true. Otherwise, the + * contents are unspecified. + * @return true if the input was successfully parsed. Otherwise, false. + */ + private static boolean tryParseIsoTimeZone(final ByteSlice bs, final MutableObject zoneId, + final MutableLong offsetSeconds) { + if (bs.size() == 0) { + return false; + } + + final char front = (char) bs.front(); + if (front == 'Z') { + zoneId.setValue(ZoneOffset.UTC); + offsetSeconds.setValue(0); + bs.setBegin(bs.begin() + 1); + return true; + } + + // Try an offset like +02 or +03:30 or -0400 + if (front != '+' && front != '-') { + return false; + } + final boolean negative = front == '-'; + + final int savedBegin = bs.begin(); + bs.setBegin(bs.begin() + 1); + + // Reuse offsetSeconds as temp variable + if (!tryParseWholeNumber(bs, 2, 2, false, offsetSeconds)) { + bs.setBegin(savedBegin); + return false; + } + final long hours = offsetSeconds.longValue(); + + // Optional colon + tryEatChar(bs, ':'); + + long minutes = 0; + if (bs.size() != 0) { + // Reuse offsetSeconds as temp variable + if (!tryParseWholeNumber(bs, 2, 2, false, offsetSeconds)) { + bs.setBegin(savedBegin); + return false; + } + minutes = offsetSeconds.longValue(); + } + zoneId.setValue(ZoneOffset.UTC); + + // If someone says yyyy-MM-DDThh:mm:ss-05 + // The "-05" means this is meant to be interpreted as UTC-5. + // If I parse yyyy-MM-DDThh:mm:ss in UTC (without any offset), it will be 5 hours later than + // what the user intended. So in other words, I need to negate the -05 + final long offset = ((hours * 60) + minutes) * 60; + offsetSeconds.setValue(negative ? offset : -offset); + return true; + } + } + + public interface CustomTimeZoneParser { + boolean tryParse(final ByteSlice bs, final MutableObject zoneId, final MutableLong offsetSeconds); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleMath.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleMath.java new file mode 100644 index 00000000000..fc3fdf4e93f --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleMath.java @@ -0,0 +1,1066 @@ +/* + * @(#)FastDoubleMath.java Copyright © 2021. Werner Randelshofer, Switzerland. MIT License. + */ + +/* + * TEMPORARY Modifications by kosak: change package name FOR NOW (until we correctly include this jar in our build) + */ + + +package io.deephaven.csv.tokenization.external; + +import java.util.Objects; + +/** + * This class provides the mathematical functions needed by {@link FastDoubleParser}. + *

+ * <p>
+ * This is a C++ to Java port of Daniel Lemire's fast_double_parser.
+ * <p>
+ * The code contains enhancements from Daniel Lemire's fast_float_parser, so that it can parse double Strings with very
+ * long sequences of numbers.
+ * <p>
+ * References:
+ * <ul>
+ * <li>Daniel Lemire, fast_double_parser, 4x faster than strtod. Apache License 2.0 or Boost Software License.
+ * (github.com)</li>
+ * <li>Daniel Lemire, fast_float number parsing library: 4x faster than strtod. Apache License 2.0. (github.com)</li>
+ * <li>Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 (8), 2021.
+ * arXiv.2101.11408v3 [cs.DS] 24 Feb 2021 (arxiv.org)</li>
+ * </ul>
+ */
+class FastDoubleMath {
+    /**
+     * The smallest non-zero float (binary64) is 2^-1074. We take as input numbers of the form w x 10^q where w < 2^64.
+     * We have that {@literal w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076}.

+     * <p>
+     * However, we have that {@literal (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074}. Thus it is possible
+     * for a number of the form w * 10^-342 where w is a 64-bit value to be a non-zero floating-point number.
+     * <p>
+     * ********
+     * <p>
+     * If we are solely interested in the *normal* numbers then the smallest value is 2^-1022. We can generate a value
+     * larger than 2^-1022 with expressions of the form w * 10^-326. Thus we need to pick FASTFLOAT_SMALLEST_POWER >=
+     * -326.
+     * <p>
+     * ********
+     * <p>
    + * Any number of form w * 10^309 where w>= 1 is going to be infinite in binary64 so we never need to worry about + * powers of 5 greater than 308. + */ + private final static int FASTFLOAT_DEC_SMALLEST_POWER = -325; + private final static int FASTFLOAT_DEC_LARGEST_POWER = 308; + private final static int FASTFLOAT_HEX_SMALLEST_POWER = Double.MIN_EXPONENT; + private final static int FASTFLOAT_HEX_LARGEST_POWER = Double.MAX_EXPONENT; + /** + * Precomputed powers of ten from 10^0 to 10^22. These can be represented exactly using the double type. + */ + private static final double[] powerOfTen = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + /** + * When mapping numbers from decimal to binary, we go from w * 10^q to m * 2^p but we have 10^q = 5^q * 2^q, so + * effectively we are trying to match w * 2^q * 5^q to m * 2^p. Thus the powers of two are not a concern since they + * can be represented exactly using the binary notation, only the powers of five affect the binary significand. + *

+     * <p>
    + * The mantissas of powers of ten from -308 to 308, extended out to sixty four bits. The array contains the powers + * of ten approximated as a 64-bit mantissa. It goes from 10^FASTFLOAT_SMALLEST_POWER to 10^FASTFLOAT_LARGEST_POWER + * (inclusively). The mantissa is truncated, and never rounded up. Uses about 5KB. + */ + private static final long[] MANTISSA_64 = { + 0xa5ced43b7e3e9188L, 0xcf42894a5dce35eaL, + 0x818995ce7aa0e1b2L, 0xa1ebfb4219491a1fL, + 0xca66fa129f9b60a6L, 0xfd00b897478238d0L, + 0x9e20735e8cb16382L, 0xc5a890362fddbc62L, + 0xf712b443bbd52b7bL, 0x9a6bb0aa55653b2dL, + 0xc1069cd4eabe89f8L, 0xf148440a256e2c76L, + 0x96cd2a865764dbcaL, 0xbc807527ed3e12bcL, + 0xeba09271e88d976bL, 0x93445b8731587ea3L, + 0xb8157268fdae9e4cL, 0xe61acf033d1a45dfL, + 0x8fd0c16206306babL, 0xb3c4f1ba87bc8696L, + 0xe0b62e2929aba83cL, 0x8c71dcd9ba0b4925L, + 0xaf8e5410288e1b6fL, 0xdb71e91432b1a24aL, + 0x892731ac9faf056eL, 0xab70fe17c79ac6caL, + 0xd64d3d9db981787dL, 0x85f0468293f0eb4eL, + 0xa76c582338ed2621L, 0xd1476e2c07286faaL, + 0x82cca4db847945caL, 0xa37fce126597973cL, + 0xcc5fc196fefd7d0cL, 0xff77b1fcbebcdc4fL, + 0x9faacf3df73609b1L, 0xc795830d75038c1dL, + 0xf97ae3d0d2446f25L, 0x9becce62836ac577L, + 0xc2e801fb244576d5L, 0xf3a20279ed56d48aL, + 0x9845418c345644d6L, 0xbe5691ef416bd60cL, + 0xedec366b11c6cb8fL, 0x94b3a202eb1c3f39L, + 0xb9e08a83a5e34f07L, 0xe858ad248f5c22c9L, + 0x91376c36d99995beL, 0xb58547448ffffb2dL, + 0xe2e69915b3fff9f9L, 0x8dd01fad907ffc3bL, + 0xb1442798f49ffb4aL, 0xdd95317f31c7fa1dL, + 0x8a7d3eef7f1cfc52L, 0xad1c8eab5ee43b66L, + 0xd863b256369d4a40L, 0x873e4f75e2224e68L, + 0xa90de3535aaae202L, 0xd3515c2831559a83L, + 0x8412d9991ed58091L, 0xa5178fff668ae0b6L, + 0xce5d73ff402d98e3L, 0x80fa687f881c7f8eL, + 0xa139029f6a239f72L, 0xc987434744ac874eL, + 0xfbe9141915d7a922L, 0x9d71ac8fada6c9b5L, + 0xc4ce17b399107c22L, 0xf6019da07f549b2bL, + 0x99c102844f94e0fbL, 0xc0314325637a1939L, + 0xf03d93eebc589f88L, 0x96267c7535b763b5L, + 0xbbb01b9283253ca2L, 0xea9c227723ee8bcbL, + 0x92a1958a7675175fL, 0xb749faed14125d36L, + 0xe51c79a85916f484L, 0x8f31cc0937ae58d2L, + 0xb2fe3f0b8599ef07L, 0xdfbdcece67006ac9L, + 0x8bd6a141006042bdL, 0xaecc49914078536dL, + 0xda7f5bf590966848L, 0x888f99797a5e012dL, + 0xaab37fd7d8f58178L, 0xd5605fcdcf32e1d6L, + 0x855c3be0a17fcd26L, 0xa6b34ad8c9dfc06fL, + 0xd0601d8efc57b08bL, 0x823c12795db6ce57L, + 0xa2cb1717b52481edL, 0xcb7ddcdda26da268L, + 0xfe5d54150b090b02L, 0x9efa548d26e5a6e1L, + 0xc6b8e9b0709f109aL, 0xf867241c8cc6d4c0L, + 0x9b407691d7fc44f8L, 0xc21094364dfb5636L, + 0xf294b943e17a2bc4L, 0x979cf3ca6cec5b5aL, + 0xbd8430bd08277231L, 0xece53cec4a314ebdL, + 0x940f4613ae5ed136L, 0xb913179899f68584L, + 0xe757dd7ec07426e5L, 0x9096ea6f3848984fL, + 0xb4bca50b065abe63L, 0xe1ebce4dc7f16dfbL, + 0x8d3360f09cf6e4bdL, 0xb080392cc4349decL, + 0xdca04777f541c567L, 0x89e42caaf9491b60L, + 0xac5d37d5b79b6239L, 0xd77485cb25823ac7L, + 0x86a8d39ef77164bcL, 0xa8530886b54dbdebL, + 0xd267caa862a12d66L, 0x8380dea93da4bc60L, + 0xa46116538d0deb78L, 0xcd795be870516656L, + 0x806bd9714632dff6L, 0xa086cfcd97bf97f3L, + 0xc8a883c0fdaf7df0L, 0xfad2a4b13d1b5d6cL, + 0x9cc3a6eec6311a63L, 0xc3f490aa77bd60fcL, + 0xf4f1b4d515acb93bL, 0x991711052d8bf3c5L, + 0xbf5cd54678eef0b6L, 0xef340a98172aace4L, + 0x9580869f0e7aac0eL, 0xbae0a846d2195712L, + 0xe998d258869facd7L, 0x91ff83775423cc06L, + 0xb67f6455292cbf08L, 0xe41f3d6a7377eecaL, + 0x8e938662882af53eL, 0xb23867fb2a35b28dL, + 0xdec681f9f4c31f31L, 0x8b3c113c38f9f37eL, + 0xae0b158b4738705eL, 0xd98ddaee19068c76L, + 0x87f8a8d4cfa417c9L, 0xa9f6d30a038d1dbcL, + 
0xd47487cc8470652bL, 0x84c8d4dfd2c63f3bL, + 0xa5fb0a17c777cf09L, 0xcf79cc9db955c2ccL, + 0x81ac1fe293d599bfL, 0xa21727db38cb002fL, + 0xca9cf1d206fdc03bL, 0xfd442e4688bd304aL, + 0x9e4a9cec15763e2eL, 0xc5dd44271ad3cdbaL, + 0xf7549530e188c128L, 0x9a94dd3e8cf578b9L, + 0xc13a148e3032d6e7L, 0xf18899b1bc3f8ca1L, + 0x96f5600f15a7b7e5L, 0xbcb2b812db11a5deL, + 0xebdf661791d60f56L, 0x936b9fcebb25c995L, + 0xb84687c269ef3bfbL, 0xe65829b3046b0afaL, + 0x8ff71a0fe2c2e6dcL, 0xb3f4e093db73a093L, + 0xe0f218b8d25088b8L, 0x8c974f7383725573L, + 0xafbd2350644eeacfL, 0xdbac6c247d62a583L, + 0x894bc396ce5da772L, 0xab9eb47c81f5114fL, + 0xd686619ba27255a2L, 0x8613fd0145877585L, + 0xa798fc4196e952e7L, 0xd17f3b51fca3a7a0L, + 0x82ef85133de648c4L, 0xa3ab66580d5fdaf5L, + 0xcc963fee10b7d1b3L, 0xffbbcfe994e5c61fL, + 0x9fd561f1fd0f9bd3L, 0xc7caba6e7c5382c8L, + 0xf9bd690a1b68637bL, 0x9c1661a651213e2dL, + 0xc31bfa0fe5698db8L, 0xf3e2f893dec3f126L, + 0x986ddb5c6b3a76b7L, 0xbe89523386091465L, + 0xee2ba6c0678b597fL, 0x94db483840b717efL, + 0xba121a4650e4ddebL, 0xe896a0d7e51e1566L, + 0x915e2486ef32cd60L, 0xb5b5ada8aaff80b8L, + 0xe3231912d5bf60e6L, 0x8df5efabc5979c8fL, + 0xb1736b96b6fd83b3L, 0xddd0467c64bce4a0L, + 0x8aa22c0dbef60ee4L, 0xad4ab7112eb3929dL, + 0xd89d64d57a607744L, 0x87625f056c7c4a8bL, + 0xa93af6c6c79b5d2dL, 0xd389b47879823479L, + 0x843610cb4bf160cbL, 0xa54394fe1eedb8feL, + 0xce947a3da6a9273eL, 0x811ccc668829b887L, + 0xa163ff802a3426a8L, 0xc9bcff6034c13052L, + 0xfc2c3f3841f17c67L, 0x9d9ba7832936edc0L, + 0xc5029163f384a931L, 0xf64335bcf065d37dL, + 0x99ea0196163fa42eL, 0xc06481fb9bcf8d39L, + 0xf07da27a82c37088L, 0x964e858c91ba2655L, + 0xbbe226efb628afeaL, 0xeadab0aba3b2dbe5L, + 0x92c8ae6b464fc96fL, 0xb77ada0617e3bbcbL, + 0xe55990879ddcaabdL, 0x8f57fa54c2a9eab6L, + 0xb32df8e9f3546564L, 0xdff9772470297ebdL, + 0x8bfbea76c619ef36L, 0xaefae51477a06b03L, + 0xdab99e59958885c4L, 0x88b402f7fd75539bL, + 0xaae103b5fcd2a881L, 0xd59944a37c0752a2L, + 0x857fcae62d8493a5L, 0xa6dfbd9fb8e5b88eL, + 0xd097ad07a71f26b2L, 0x825ecc24c873782fL, + 0xa2f67f2dfa90563bL, 0xcbb41ef979346bcaL, + 0xfea126b7d78186bcL, 0x9f24b832e6b0f436L, + 0xc6ede63fa05d3143L, 0xf8a95fcf88747d94L, + 0x9b69dbe1b548ce7cL, 0xc24452da229b021bL, + 0xf2d56790ab41c2a2L, 0x97c560ba6b0919a5L, + 0xbdb6b8e905cb600fL, 0xed246723473e3813L, + 0x9436c0760c86e30bL, 0xb94470938fa89bceL, + 0xe7958cb87392c2c2L, 0x90bd77f3483bb9b9L, + 0xb4ecd5f01a4aa828L, 0xe2280b6c20dd5232L, + 0x8d590723948a535fL, 0xb0af48ec79ace837L, + 0xdcdb1b2798182244L, 0x8a08f0f8bf0f156bL, + 0xac8b2d36eed2dac5L, 0xd7adf884aa879177L, + 0x86ccbb52ea94baeaL, 0xa87fea27a539e9a5L, + 0xd29fe4b18e88640eL, 0x83a3eeeef9153e89L, + 0xa48ceaaab75a8e2bL, 0xcdb02555653131b6L, + 0x808e17555f3ebf11L, 0xa0b19d2ab70e6ed6L, + 0xc8de047564d20a8bL, 0xfb158592be068d2eL, + 0x9ced737bb6c4183dL, 0xc428d05aa4751e4cL, + 0xf53304714d9265dfL, 0x993fe2c6d07b7fabL, + 0xbf8fdb78849a5f96L, 0xef73d256a5c0f77cL, + 0x95a8637627989aadL, 0xbb127c53b17ec159L, + 0xe9d71b689dde71afL, 0x9226712162ab070dL, + 0xb6b00d69bb55c8d1L, 0xe45c10c42a2b3b05L, + 0x8eb98a7a9a5b04e3L, 0xb267ed1940f1c61cL, + 0xdf01e85f912e37a3L, 0x8b61313bbabce2c6L, + 0xae397d8aa96c1b77L, 0xd9c7dced53c72255L, + 0x881cea14545c7575L, 0xaa242499697392d2L, + 0xd4ad2dbfc3d07787L, 0x84ec3c97da624ab4L, + 0xa6274bbdd0fadd61L, 0xcfb11ead453994baL, + 0x81ceb32c4b43fcf4L, 0xa2425ff75e14fc31L, + 0xcad2f7f5359a3b3eL, 0xfd87b5f28300ca0dL, + 0x9e74d1b791e07e48L, 0xc612062576589ddaL, + 0xf79687aed3eec551L, 0x9abe14cd44753b52L, + 0xc16d9a0095928a27L, 0xf1c90080baf72cb1L, + 0x971da05074da7beeL, 
0xbce5086492111aeaL, + 0xec1e4a7db69561a5L, 0x9392ee8e921d5d07L, + 0xb877aa3236a4b449L, 0xe69594bec44de15bL, + 0x901d7cf73ab0acd9L, 0xb424dc35095cd80fL, + 0xe12e13424bb40e13L, 0x8cbccc096f5088cbL, + 0xafebff0bcb24aafeL, 0xdbe6fecebdedd5beL, + 0x89705f4136b4a597L, 0xabcc77118461cefcL, + 0xd6bf94d5e57a42bcL, 0x8637bd05af6c69b5L, + 0xa7c5ac471b478423L, 0xd1b71758e219652bL, + 0x83126e978d4fdf3bL, 0xa3d70a3d70a3d70aL, + 0xccccccccccccccccL, 0x8000000000000000L, + 0xa000000000000000L, 0xc800000000000000L, + 0xfa00000000000000L, 0x9c40000000000000L, + 0xc350000000000000L, 0xf424000000000000L, + 0x9896800000000000L, 0xbebc200000000000L, + 0xee6b280000000000L, 0x9502f90000000000L, + 0xba43b74000000000L, 0xe8d4a51000000000L, + 0x9184e72a00000000L, 0xb5e620f480000000L, + 0xe35fa931a0000000L, 0x8e1bc9bf04000000L, + 0xb1a2bc2ec5000000L, 0xde0b6b3a76400000L, + 0x8ac7230489e80000L, 0xad78ebc5ac620000L, + 0xd8d726b7177a8000L, 0x878678326eac9000L, + 0xa968163f0a57b400L, 0xd3c21bcecceda100L, + 0x84595161401484a0L, 0xa56fa5b99019a5c8L, + 0xcecb8f27f4200f3aL, 0x813f3978f8940984L, + 0xa18f07d736b90be5L, 0xc9f2c9cd04674edeL, + 0xfc6f7c4045812296L, 0x9dc5ada82b70b59dL, + 0xc5371912364ce305L, 0xf684df56c3e01bc6L, + 0x9a130b963a6c115cL, 0xc097ce7bc90715b3L, + 0xf0bdc21abb48db20L, 0x96769950b50d88f4L, + 0xbc143fa4e250eb31L, 0xeb194f8e1ae525fdL, + 0x92efd1b8d0cf37beL, 0xb7abc627050305adL, + 0xe596b7b0c643c719L, 0x8f7e32ce7bea5c6fL, + 0xb35dbf821ae4f38bL, 0xe0352f62a19e306eL, + 0x8c213d9da502de45L, 0xaf298d050e4395d6L, + 0xdaf3f04651d47b4cL, 0x88d8762bf324cd0fL, + 0xab0e93b6efee0053L, 0xd5d238a4abe98068L, + 0x85a36366eb71f041L, 0xa70c3c40a64e6c51L, + 0xd0cf4b50cfe20765L, 0x82818f1281ed449fL, + 0xa321f2d7226895c7L, 0xcbea6f8ceb02bb39L, + 0xfee50b7025c36a08L, 0x9f4f2726179a2245L, + 0xc722f0ef9d80aad6L, 0xf8ebad2b84e0d58bL, + 0x9b934c3b330c8577L, 0xc2781f49ffcfa6d5L, + 0xf316271c7fc3908aL, 0x97edd871cfda3a56L, + 0xbde94e8e43d0c8ecL, 0xed63a231d4c4fb27L, + 0x945e455f24fb1cf8L, 0xb975d6b6ee39e436L, + 0xe7d34c64a9c85d44L, 0x90e40fbeea1d3a4aL, + 0xb51d13aea4a488ddL, 0xe264589a4dcdab14L, + 0x8d7eb76070a08aecL, 0xb0de65388cc8ada8L, + 0xdd15fe86affad912L, 0x8a2dbf142dfcc7abL, + 0xacb92ed9397bf996L, 0xd7e77a8f87daf7fbL, + 0x86f0ac99b4e8dafdL, 0xa8acd7c0222311bcL, + 0xd2d80db02aabd62bL, 0x83c7088e1aab65dbL, + 0xa4b8cab1a1563f52L, 0xcde6fd5e09abcf26L, + 0x80b05e5ac60b6178L, 0xa0dc75f1778e39d6L, + 0xc913936dd571c84cL, 0xfb5878494ace3a5fL, + 0x9d174b2dcec0e47bL, 0xc45d1df942711d9aL, + 0xf5746577930d6500L, 0x9968bf6abbe85f20L, + 0xbfc2ef456ae276e8L, 0xefb3ab16c59b14a2L, + 0x95d04aee3b80ece5L, 0xbb445da9ca61281fL, + 0xea1575143cf97226L, 0x924d692ca61be758L, + 0xb6e0c377cfa2e12eL, 0xe498f455c38b997aL, + 0x8edf98b59a373fecL, 0xb2977ee300c50fe7L, + 0xdf3d5e9bc0f653e1L, 0x8b865b215899f46cL, + 0xae67f1e9aec07187L, 0xda01ee641a708de9L, + 0x884134fe908658b2L, 0xaa51823e34a7eedeL, + 0xd4e5e2cdc1d1ea96L, 0x850fadc09923329eL, + 0xa6539930bf6bff45L, 0xcfe87f7cef46ff16L, + 0x81f14fae158c5f6eL, 0xa26da3999aef7749L, + 0xcb090c8001ab551cL, 0xfdcb4fa002162a63L, + 0x9e9f11c4014dda7eL, 0xc646d63501a1511dL, + 0xf7d88bc24209a565L, 0x9ae757596946075fL, + 0xc1a12d2fc3978937L, 0xf209787bb47d6b84L, + 0x9745eb4d50ce6332L, 0xbd176620a501fbffL, + 0xec5d3fa8ce427affL, 0x93ba47c980e98cdfL, + 0xb8a8d9bbe123f017L, 0xe6d3102ad96cec1dL, + 0x9043ea1ac7e41392L, 0xb454e4a179dd1877L, + 0xe16a1dc9d8545e94L, 0x8ce2529e2734bb1dL, + 0xb01ae745b101e9e4L, 0xdc21a1171d42645dL, + 0x899504ae72497ebaL, 0xabfa45da0edbde69L, + 0xd6f8d7509292d603L, 0x865b86925b9bc5c2L, + 
0xa7f26836f282b732L, 0xd1ef0244af2364ffL, + 0x8335616aed761f1fL, 0xa402b9c5a8d3a6e7L, + 0xcd036837130890a1L, 0x802221226be55a64L, + 0xa02aa96b06deb0fdL, 0xc83553c5c8965d3dL, + 0xfa42a8b73abbf48cL, 0x9c69a97284b578d7L, + 0xc38413cf25e2d70dL, 0xf46518c2ef5b8cd1L, + 0x98bf2f79d5993802L, 0xbeeefb584aff8603L, + 0xeeaaba2e5dbf6784L, 0x952ab45cfa97a0b2L, + 0xba756174393d88dfL, 0xe912b9d1478ceb17L, + 0x91abb422ccb812eeL, 0xb616a12b7fe617aaL, + 0xe39c49765fdf9d94L, 0x8e41ade9fbebc27dL, + 0xb1d219647ae6b31cL, 0xde469fbd99a05fe3L, + 0x8aec23d680043beeL, 0xada72ccc20054ae9L, + 0xd910f7ff28069da4L, 0x87aa9aff79042286L, + 0xa99541bf57452b28L, 0xd3fa922f2d1675f2L, + 0x847c9b5d7c2e09b7L, 0xa59bc234db398c25L, + 0xcf02b2c21207ef2eL, 0x8161afb94b44f57dL, + 0xa1ba1ba79e1632dcL, 0xca28a291859bbf93L, + 0xfcb2cb35e702af78L, 0x9defbf01b061adabL, + 0xc56baec21c7a1916L, 0xf6c69a72a3989f5bL, + 0x9a3c2087a63f6399L, 0xc0cb28a98fcf3c7fL, + 0xf0fdf2d3f3c30b9fL, 0x969eb7c47859e743L, + 0xbc4665b596706114L, 0xeb57ff22fc0c7959L, + 0x9316ff75dd87cbd8L, 0xb7dcbf5354e9beceL, + 0xe5d3ef282a242e81L, 0x8fa475791a569d10L, + 0xb38d92d760ec4455L, 0xe070f78d3927556aL, + 0x8c469ab843b89562L, 0xaf58416654a6babbL, + 0xdb2e51bfe9d0696aL, 0x88fcf317f22241e2L, + 0xab3c2fddeeaad25aL, 0xd60b3bd56a5586f1L, + 0x85c7056562757456L, 0xa738c6bebb12d16cL, + 0xd106f86e69d785c7L, 0x82a45b450226b39cL, + 0xa34d721642b06084L, 0xcc20ce9bd35c78a5L, + 0xff290242c83396ceL, 0x9f79a169bd203e41L, + 0xc75809c42c684dd1L, 0xf92e0c3537826145L, + 0x9bbcc7a142b17ccbL, 0xc2abf989935ddbfeL, + 0xf356f7ebf83552feL, 0x98165af37b2153deL, + 0xbe1bf1b059e9a8d6L, 0xeda2ee1c7064130cL, + 0x9485d4d1c63e8be7L, 0xb9a74a0637ce2ee1L, + 0xe8111c87c5c1ba99L, 0x910ab1d4db9914a0L, + 0xb54d5e4a127f59c8L, 0xe2a0b5dc971f303aL, + 0x8da471a9de737e24L, 0xb10d8e1456105dadL, + 0xdd50f1996b947518L, 0x8a5296ffe33cc92fL, + 0xace73cbfdc0bfb7bL, 0xd8210befd30efa5aL, + 0x8714a775e3e95c78L, 0xa8d9d1535ce3b396L, + 0xd31045a8341ca07cL, 0x83ea2b892091e44dL, + 0xa4e4b66b68b65d60L, 0xce1de40642e3f4b9L, + 0x80d2ae83e9ce78f3L, 0xa1075a24e4421730L, + 0xc94930ae1d529cfcL, 0xfb9b7cd9a4a7443cL, + 0x9d412e0806e88aa5L, 0xc491798a08a2ad4eL, + 0xf5b5d7ec8acb58a2L, 0x9991a6f3d6bf1765L, + 0xbff610b0cc6edd3fL, 0xeff394dcff8a948eL, + 0x95f83d0a1fb69cd9L, 0xbb764c4ca7a4440fL, + 0xea53df5fd18d5513L, 0x92746b9be2f8552cL, + 0xb7118682dbb66a77L, 0xe4d5e82392a40515L, + 0x8f05b1163ba6832dL, 0xb2c71d5bca9023f8L, + 0xdf78e4b2bd342cf6L, 0x8bab8eefb6409c1aL, + 0xae9672aba3d0c320L, 0xda3c0f568cc4f3e8L, + 0x8865899617fb1871L, 0xaa7eebfb9df9de8dL, + 0xd51ea6fa85785631L, 0x8533285c936b35deL, + 0xa67ff273b8460356L, 0xd01fef10a657842cL, + 0x8213f56a67f6b29bL, 0xa298f2c501f45f42L, + 0xcb3f2f7642717713L, 0xfe0efb53d30dd4d7L, + 0x9ec95d1463e8a506L, 0xc67bb4597ce2ce48L, + 0xf81aa16fdc1b81daL, 0x9b10a4e5e9913128L, + 0xc1d4ce1f63f57d72L, 0xf24a01a73cf2dccfL, + 0x976e41088617ca01L, 0xbd49d14aa79dbc82L, + 0xec9c459d51852ba2L, 0x93e1ab8252f33b45L, + 0xb8da1662e7b00a17L, 0xe7109bfba19c0c9dL, + 0x906a617d450187e2L, 0xb484f9dc9641e9daL, + 0xe1a63853bbd26451L, 0x8d07e33455637eb2L, + 0xb049dc016abc5e5fL, 0xdc5c5301c56b75f7L, + 0x89b9b3e11b6329baL, 0xac2820d9623bf429L, + 0xd732290fbacaf133L, 0x867f59a9d4bed6c0L, + 0xa81f301449ee8c70L, 0xd226fc195c6a2f8cL, + 0x83585d8fd9c25db7L, 0xa42e74f3d032f525L, + 0xcd3a1230c43fb26fL, 0x80444b5e7aa7cf85L, + 0xa0555e361951c366L, 0xc86ab5c39fa63440L, + 0xfa856334878fc150L, 0x9c935e00d4b9d8d2L, + 0xc3b8358109e84f07L, 0xf4a642e14c6262c8L, + 0x98e7e9cccfbd7dbdL, 0xbf21e44003acdd2cL, + 0xeeea5d5004981478L, 
0x95527a5202df0ccbL, + 0xbaa718e68396cffdL, 0xe950df20247c83fdL, + 0x91d28b7416cdd27eL, 0xb6472e511c81471dL, + 0xe3d8f9e563a198e5L, 0x8e679c2f5e44ff8fL}; + /** + * A complement to mantissa_64 complete to a 128-bit mantissa. Uses about 5KB but is rarely accessed. + */ + private final static long[] MANTISSA_128 = { + 0x419ea3bd35385e2dL, 0x52064cac828675b9L, + 0x7343efebd1940993L, 0x1014ebe6c5f90bf8L, + 0xd41a26e077774ef6L, 0x8920b098955522b4L, + 0x55b46e5f5d5535b0L, 0xeb2189f734aa831dL, + 0xa5e9ec7501d523e4L, 0x47b233c92125366eL, + 0x999ec0bb696e840aL, 0xc00670ea43ca250dL, + 0x380406926a5e5728L, 0xc605083704f5ecf2L, + 0xf7864a44c633682eL, 0x7ab3ee6afbe0211dL, + 0x5960ea05bad82964L, 0x6fb92487298e33bdL, + 0xa5d3b6d479f8e056L, 0x8f48a4899877186cL, + 0x331acdabfe94de87L, 0x9ff0c08b7f1d0b14L, + 0x7ecf0ae5ee44dd9L, 0xc9e82cd9f69d6150L, + 0xbe311c083a225cd2L, 0x6dbd630a48aaf406L, + 0x92cbbccdad5b108L, 0x25bbf56008c58ea5L, + 0xaf2af2b80af6f24eL, 0x1af5af660db4aee1L, + 0x50d98d9fc890ed4dL, 0xe50ff107bab528a0L, + 0x1e53ed49a96272c8L, 0x25e8e89c13bb0f7aL, + 0x77b191618c54e9acL, 0xd59df5b9ef6a2417L, + 0x4b0573286b44ad1dL, 0x4ee367f9430aec32L, + 0x229c41f793cda73fL, 0x6b43527578c1110fL, + 0x830a13896b78aaa9L, 0x23cc986bc656d553L, + 0x2cbfbe86b7ec8aa8L, 0x7bf7d71432f3d6a9L, + 0xdaf5ccd93fb0cc53L, 0xd1b3400f8f9cff68L, + 0x23100809b9c21fa1L, 0xabd40a0c2832a78aL, + 0x16c90c8f323f516cL, 0xae3da7d97f6792e3L, + 0x99cd11cfdf41779cL, 0x40405643d711d583L, + 0x482835ea666b2572L, 0xda3243650005eecfL, + 0x90bed43e40076a82L, 0x5a7744a6e804a291L, + 0x711515d0a205cb36L, 0xd5a5b44ca873e03L, + 0xe858790afe9486c2L, 0x626e974dbe39a872L, + 0xfb0a3d212dc8128fL, 0x7ce66634bc9d0b99L, + 0x1c1fffc1ebc44e80L, 0xa327ffb266b56220L, + 0x4bf1ff9f0062baa8L, 0x6f773fc3603db4a9L, + 0xcb550fb4384d21d3L, 0x7e2a53a146606a48L, + 0x2eda7444cbfc426dL, 0xfa911155fefb5308L, + 0x793555ab7eba27caL, 0x4bc1558b2f3458deL, + 0x9eb1aaedfb016f16L, 0x465e15a979c1cadcL, + 0xbfacd89ec191ec9L, 0xcef980ec671f667bL, + 0x82b7e12780e7401aL, 0xd1b2ecb8b0908810L, + 0x861fa7e6dcb4aa15L, 0x67a791e093e1d49aL, + 0xe0c8bb2c5c6d24e0L, 0x58fae9f773886e18L, + 0xaf39a475506a899eL, 0x6d8406c952429603L, + 0xc8e5087ba6d33b83L, 0xfb1e4a9a90880a64L, + 0x5cf2eea09a55067fL, 0xf42faa48c0ea481eL, + 0xf13b94daf124da26L, 0x76c53d08d6b70858L, + 0x54768c4b0c64ca6eL, 0xa9942f5dcf7dfd09L, + 0xd3f93b35435d7c4cL, 0xc47bc5014a1a6dafL, + 0x359ab6419ca1091bL, 0xc30163d203c94b62L, + 0x79e0de63425dcf1dL, 0x985915fc12f542e4L, + 0x3e6f5b7b17b2939dL, 0xa705992ceecf9c42L, + 0x50c6ff782a838353L, 0xa4f8bf5635246428L, + 0x871b7795e136be99L, 0x28e2557b59846e3fL, + 0x331aeada2fe589cfL, 0x3ff0d2c85def7621L, + 0xfed077a756b53a9L, 0xd3e8495912c62894L, + 0x64712dd7abbbd95cL, 0xbd8d794d96aacfb3L, + 0xecf0d7a0fc5583a0L, 0xf41686c49db57244L, + 0x311c2875c522ced5L, 0x7d633293366b828bL, + 0xae5dff9c02033197L, 0xd9f57f830283fdfcL, + 0xd072df63c324fd7bL, 0x4247cb9e59f71e6dL, + 0x52d9be85f074e608L, 0x67902e276c921f8bL, + 0xba1cd8a3db53b6L, 0x80e8a40eccd228a4L, + 0x6122cd128006b2cdL, 0x796b805720085f81L, + 0xcbe3303674053bb0L, 0xbedbfc4411068a9cL, + 0xee92fb5515482d44L, 0x751bdd152d4d1c4aL, + 0xd262d45a78a0635dL, 0x86fb897116c87c34L, + 0xd45d35e6ae3d4da0L, 0x8974836059cca109L, + 0x2bd1a438703fc94bL, 0x7b6306a34627ddcfL, + 0x1a3bc84c17b1d542L, 0x20caba5f1d9e4a93L, + 0x547eb47b7282ee9cL, 0xe99e619a4f23aa43L, + 0x6405fa00e2ec94d4L, 0xde83bc408dd3dd04L, + 0x9624ab50b148d445L, 0x3badd624dd9b0957L, + 0xe54ca5d70a80e5d6L, 0x5e9fcf4ccd211f4cL, + 0x7647c3200069671fL, 0x29ecd9f40041e073L, + 0xf468107100525890L, 
0x7182148d4066eeb4L, + 0xc6f14cd848405530L, 0xb8ada00e5a506a7cL, + 0xa6d90811f0e4851cL, 0x908f4a166d1da663L, + 0x9a598e4e043287feL, 0x40eff1e1853f29fdL, + 0xd12bee59e68ef47cL, 0x82bb74f8301958ceL, + 0xe36a52363c1faf01L, 0xdc44e6c3cb279ac1L, + 0x29ab103a5ef8c0b9L, 0x7415d448f6b6f0e7L, + 0x111b495b3464ad21L, 0xcab10dd900beec34L, + 0x3d5d514f40eea742L, 0xcb4a5a3112a5112L, + 0x47f0e785eaba72abL, 0x59ed216765690f56L, + 0x306869c13ec3532cL, 0x1e414218c73a13fbL, + 0xe5d1929ef90898faL, 0xdf45f746b74abf39L, + 0x6b8bba8c328eb783L, 0x66ea92f3f326564L, + 0xc80a537b0efefebdL, 0xbd06742ce95f5f36L, + 0x2c48113823b73704L, 0xf75a15862ca504c5L, + 0x9a984d73dbe722fbL, 0xc13e60d0d2e0ebbaL, + 0x318df905079926a8L, 0xfdf17746497f7052L, + 0xfeb6ea8bedefa633L, 0xfe64a52ee96b8fc0L, + 0x3dfdce7aa3c673b0L, 0x6bea10ca65c084eL, + 0x486e494fcff30a62L, 0x5a89dba3c3efccfaL, + 0xf89629465a75e01cL, 0xf6bbb397f1135823L, + 0x746aa07ded582e2cL, 0xa8c2a44eb4571cdcL, + 0x92f34d62616ce413L, 0x77b020baf9c81d17L, + 0xace1474dc1d122eL, 0xd819992132456baL, + 0x10e1fff697ed6c69L, 0xca8d3ffa1ef463c1L, + 0xbd308ff8a6b17cb2L, 0xac7cb3f6d05ddbdeL, + 0x6bcdf07a423aa96bL, 0x86c16c98d2c953c6L, + 0xe871c7bf077ba8b7L, 0x11471cd764ad4972L, + 0xd598e40d3dd89bcfL, 0x4aff1d108d4ec2c3L, + 0xcedf722a585139baL, 0xc2974eb4ee658828L, + 0x733d226229feea32L, 0x806357d5a3f525fL, + 0xca07c2dcb0cf26f7L, 0xfc89b393dd02f0b5L, + 0xbbac2078d443ace2L, 0xd54b944b84aa4c0dL, + 0xa9e795e65d4df11L, 0x4d4617b5ff4a16d5L, + 0x504bced1bf8e4e45L, 0xe45ec2862f71e1d6L, + 0x5d767327bb4e5a4cL, 0x3a6a07f8d510f86fL, + 0x890489f70a55368bL, 0x2b45ac74ccea842eL, + 0x3b0b8bc90012929dL, 0x9ce6ebb40173744L, + 0xcc420a6a101d0515L, 0x9fa946824a12232dL, + 0x47939822dc96abf9L, 0x59787e2b93bc56f7L, + 0x57eb4edb3c55b65aL, 0xede622920b6b23f1L, + 0xe95fab368e45ecedL, 0x11dbcb0218ebb414L, + 0xd652bdc29f26a119L, 0x4be76d3346f0495fL, + 0x6f70a4400c562ddbL, 0xcb4ccd500f6bb952L, + 0x7e2000a41346a7a7L, 0x8ed400668c0c28c8L, + 0x728900802f0f32faL, 0x4f2b40a03ad2ffb9L, + 0xe2f610c84987bfa8L, 0xdd9ca7d2df4d7c9L, + 0x91503d1c79720dbbL, 0x75a44c6397ce912aL, + 0xc986afbe3ee11abaL, 0xfbe85badce996168L, + 0xfae27299423fb9c3L, 0xdccd879fc967d41aL, + 0x5400e987bbc1c920L, 0x290123e9aab23b68L, + 0xf9a0b6720aaf6521L, 0xf808e40e8d5b3e69L, + 0xb60b1d1230b20e04L, 0xb1c6f22b5e6f48c2L, + 0x1e38aeb6360b1af3L, 0x25c6da63c38de1b0L, + 0x579c487e5a38ad0eL, 0x2d835a9df0c6d851L, + 0xf8e431456cf88e65L, 0x1b8e9ecb641b58ffL, + 0xe272467e3d222f3fL, 0x5b0ed81dcc6abb0fL, + 0x98e947129fc2b4e9L, 0x3f2398d747b36224L, + 0x8eec7f0d19a03aadL, 0x1953cf68300424acL, + 0x5fa8c3423c052dd7L, 0x3792f412cb06794dL, + 0xe2bbd88bbee40bd0L, 0x5b6aceaeae9d0ec4L, + 0xf245825a5a445275L, 0xeed6e2f0f0d56712L, + 0x55464dd69685606bL, 0xaa97e14c3c26b886L, + 0xd53dd99f4b3066a8L, 0xe546a8038efe4029L, + 0xde98520472bdd033L, 0x963e66858f6d4440L, + 0xdde7001379a44aa8L, 0x5560c018580d5d52L, + 0xaab8f01e6e10b4a6L, 0xcab3961304ca70e8L, + 0x3d607b97c5fd0d22L, 0x8cb89a7db77c506aL, + 0x77f3608e92adb242L, 0x55f038b237591ed3L, + 0x6b6c46dec52f6688L, 0x2323ac4b3b3da015L, + 0xabec975e0a0d081aL, 0x96e7bd358c904a21L, + 0x7e50d64177da2e54L, 0xdde50bd1d5d0b9e9L, + 0x955e4ec64b44e864L, 0xbd5af13bef0b113eL, + 0xecb1ad8aeacdd58eL, 0x67de18eda5814af2L, + 0x80eacf948770ced7L, 0xa1258379a94d028dL, + 0x96ee45813a04330L, 0x8bca9d6e188853fcL, + 0x775ea264cf55347dL, 0x95364afe032a819dL, + 0x3a83ddbd83f52204L, 0xc4926a9672793542L, + 0x75b7053c0f178293L, 0x5324c68b12dd6338L, + 0xd3f6fc16ebca5e03L, 0x88f4bb1ca6bcf584L, + 0x2b31e9e3d06c32e5L, 0x3aff322e62439fcfL, + 0x9befeb9fad487c2L, 
0x4c2ebe687989a9b3L, + 0xf9d37014bf60a10L, 0x538484c19ef38c94L, + 0x2865a5f206b06fb9L, 0xf93f87b7442e45d3L, + 0xf78f69a51539d748L, 0xb573440e5a884d1bL, + 0x31680a88f8953030L, 0xfdc20d2b36ba7c3dL, + 0x3d32907604691b4cL, 0xa63f9a49c2c1b10fL, + 0xfcf80dc33721d53L, 0xd3c36113404ea4a8L, + 0x645a1cac083126e9L, 0x3d70a3d70a3d70a3L, + 0xccccccccccccccccL, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x0L, + 0x0L, 0x4000000000000000L, + 0x5000000000000000L, 0xa400000000000000L, + 0x4d00000000000000L, 0xf020000000000000L, + 0x6c28000000000000L, 0xc732000000000000L, + 0x3c7f400000000000L, 0x4b9f100000000000L, + 0x1e86d40000000000L, 0x1314448000000000L, + 0x17d955a000000000L, 0x5dcfab0800000000L, + 0x5aa1cae500000000L, 0xf14a3d9e40000000L, + 0x6d9ccd05d0000000L, 0xe4820023a2000000L, + 0xdda2802c8a800000L, 0xd50b2037ad200000L, + 0x4526f422cc340000L, 0x9670b12b7f410000L, + 0x3c0cdd765f114000L, 0xa5880a69fb6ac800L, + 0x8eea0d047a457a00L, 0x72a4904598d6d880L, + 0x47a6da2b7f864750L, 0x999090b65f67d924L, + 0xfff4b4e3f741cf6dL, 0xbff8f10e7a8921a4L, + 0xaff72d52192b6a0dL, 0x9bf4f8a69f764490L, + 0x2f236d04753d5b4L, 0x1d762422c946590L, + 0x424d3ad2b7b97ef5L, 0xd2e0898765a7deb2L, + 0x63cc55f49f88eb2fL, 0x3cbf6b71c76b25fbL, + 0x8bef464e3945ef7aL, 0x97758bf0e3cbb5acL, + 0x3d52eeed1cbea317L, 0x4ca7aaa863ee4bddL, + 0x8fe8caa93e74ef6aL, 0xb3e2fd538e122b44L, + 0x60dbbca87196b616L, 0xbc8955e946fe31cdL, + 0x6babab6398bdbe41L, 0xc696963c7eed2dd1L, + 0xfc1e1de5cf543ca2L, 0x3b25a55f43294bcbL, + 0x49ef0eb713f39ebeL, 0x6e3569326c784337L, + 0x49c2c37f07965404L, 0xdc33745ec97be906L, + 0x69a028bb3ded71a3L, 0xc40832ea0d68ce0cL, + 0xf50a3fa490c30190L, 0x792667c6da79e0faL, + 0x577001b891185938L, 0xed4c0226b55e6f86L, + 0x544f8158315b05b4L, 0x696361ae3db1c721L, + 0x3bc3a19cd1e38e9L, 0x4ab48a04065c723L, + 0x62eb0d64283f9c76L, 0x3ba5d0bd324f8394L, + 0xca8f44ec7ee36479L, 0x7e998b13cf4e1ecbL, + 0x9e3fedd8c321a67eL, 0xc5cfe94ef3ea101eL, + 0xbba1f1d158724a12L, 0x2a8a6e45ae8edc97L, + 0xf52d09d71a3293bdL, 0x593c2626705f9c56L, + 0x6f8b2fb00c77836cL, 0xb6dfb9c0f956447L, + 0x4724bd4189bd5eacL, 0x58edec91ec2cb657L, + 0x2f2967b66737e3edL, 0xbd79e0d20082ee74L, + 0xecd8590680a3aa11L, 0xe80e6f4820cc9495L, + 0x3109058d147fdcddL, 0xbd4b46f0599fd415L, + 0x6c9e18ac7007c91aL, 0x3e2cf6bc604ddb0L, + 0x84db8346b786151cL, 0xe612641865679a63L, + 0x4fcb7e8f3f60c07eL, 0xe3be5e330f38f09dL, + 0x5cadf5bfd3072cc5L, 0x73d9732fc7c8f7f6L, + 0x2867e7fddcdd9afaL, 0xb281e1fd541501b8L, + 0x1f225a7ca91a4226L, 0x3375788de9b06958L, + 0x52d6b1641c83aeL, 0xc0678c5dbd23a49aL, + 0xf840b7ba963646e0L, 0xb650e5a93bc3d898L, + 0xa3e51f138ab4cebeL, 0xc66f336c36b10137L, + 0xb80b0047445d4184L, 0xa60dc059157491e5L, + 0x87c89837ad68db2fL, 0x29babe4598c311fbL, + 0xf4296dd6fef3d67aL, 0x1899e4a65f58660cL, + 0x5ec05dcff72e7f8fL, 0x76707543f4fa1f73L, + 0x6a06494a791c53a8L, 0x487db9d17636892L, + 0x45a9d2845d3c42b6L, 0xb8a2392ba45a9b2L, + 0x8e6cac7768d7141eL, 0x3207d795430cd926L, + 0x7f44e6bd49e807b8L, 0x5f16206c9c6209a6L, + 0x36dba887c37a8c0fL, 0xc2494954da2c9789L, + 0xf2db9baa10b7bd6cL, 0x6f92829494e5acc7L, + 0xcb772339ba1f17f9L, 0xff2a760414536efbL, + 0xfef5138519684abaL, 0x7eb258665fc25d69L, + 0xef2f773ffbd97a61L, 0xaafb550ffacfd8faL, + 0x95ba2a53f983cf38L, 0xdd945a747bf26183L, + 0x94f971119aeef9e4L, 0x7a37cd5601aab85dL, + 0xac62e055c10ab33aL, 0x577b986b314d6009L, + 0xed5a7e85fda0b80bL, 0x14588f13be847307L, + 0x596eb2d8ae258fc8L, 0x6fca5f8ed9aef3bbL, + 
0x25de7bb9480d5854L, 0xaf561aa79a10ae6aL, + 0x1b2ba1518094da04L, 0x90fb44d2f05d0842L, + 0x353a1607ac744a53L, 0x42889b8997915ce8L, + 0x69956135febada11L, 0x43fab9837e699095L, + 0x94f967e45e03f4bbL, 0x1d1be0eebac278f5L, + 0x6462d92a69731732L, 0x7d7b8f7503cfdcfeL, + 0x5cda735244c3d43eL, 0x3a0888136afa64a7L, + 0x88aaa1845b8fdd0L, 0x8aad549e57273d45L, + 0x36ac54e2f678864bL, 0x84576a1bb416a7ddL, + 0x656d44a2a11c51d5L, 0x9f644ae5a4b1b325L, + 0x873d5d9f0dde1feeL, 0xa90cb506d155a7eaL, + 0x9a7f12442d588f2L, 0xc11ed6d538aeb2fL, + 0x8f1668c8a86da5faL, 0xf96e017d694487bcL, + 0x37c981dcc395a9acL, 0x85bbe253f47b1417L, + 0x93956d7478ccec8eL, 0x387ac8d1970027b2L, + 0x6997b05fcc0319eL, 0x441fece3bdf81f03L, + 0xd527e81cad7626c3L, 0x8a71e223d8d3b074L, + 0xf6872d5667844e49L, 0xb428f8ac016561dbL, + 0xe13336d701beba52L, 0xecc0024661173473L, + 0x27f002d7f95d0190L, 0x31ec038df7b441f4L, + 0x7e67047175a15271L, 0xf0062c6e984d386L, + 0x52c07b78a3e60868L, 0xa7709a56ccdf8a82L, + 0x88a66076400bb691L, 0x6acff893d00ea435L, + 0x583f6b8c4124d43L, 0xc3727a337a8b704aL, + 0x744f18c0592e4c5cL, 0x1162def06f79df73L, + 0x8addcb5645ac2ba8L, 0x6d953e2bd7173692L, + 0xc8fa8db6ccdd0437L, 0x1d9c9892400a22a2L, + 0x2503beb6d00cab4bL, 0x2e44ae64840fd61dL, + 0x5ceaecfed289e5d2L, 0x7425a83e872c5f47L, + 0xd12f124e28f77719L, 0x82bd6b70d99aaa6fL, + 0x636cc64d1001550bL, 0x3c47f7e05401aa4eL, + 0x65acfaec34810a71L, 0x7f1839a741a14d0dL, + 0x1ede48111209a050L, 0x934aed0aab460432L, + 0xf81da84d5617853fL, 0x36251260ab9d668eL, + 0xc1d72b7c6b426019L, 0xb24cf65b8612f81fL, + 0xdee033f26797b627L, 0x169840ef017da3b1L, + 0x8e1f289560ee864eL, 0xf1a6f2bab92a27e2L, + 0xae10af696774b1dbL, 0xacca6da1e0a8ef29L, + 0x17fd090a58d32af3L, 0xddfc4b4cef07f5b0L, + 0x4abdaf101564f98eL, 0x9d6d1ad41abe37f1L, + 0x84c86189216dc5edL, 0x32fd3cf5b4e49bb4L, + 0x3fbc8c33221dc2a1L, 0xfabaf3feaa5334aL, + 0x29cb4d87f2a7400eL, 0x743e20e9ef511012L, + 0x914da9246b255416L, 0x1ad089b6c2f7548eL, + 0xa184ac2473b529b1L, 0xc9e5d72d90a2741eL, + 0x7e2fa67c7a658892L, 0xddbb901b98feeab7L, + 0x552a74227f3ea565L, 0xd53a88958f87275fL, + 0x8a892abaf368f137L, 0x2d2b7569b0432d85L, + 0x9c3b29620e29fc73L, 0x8349f3ba91b47b8fL, + 0x241c70a936219a73L, 0xed238cd383aa0110L, + 0xf4363804324a40aaL, 0xb143c6053edcd0d5L, + 0xdd94b7868e94050aL, 0xca7cf2b4191c8326L, + 0xfd1c2f611f63a3f0L, 0xbc633b39673c8cecL, + 0xd5be0503e085d813L, 0x4b2d8644d8a74e18L, + 0xddf8e7d60ed1219eL, 0xcabb90e5c942b503L, + 0x3d6a751f3b936243L, 0xcc512670a783ad4L, + 0x27fb2b80668b24c5L, 0xb1f9f660802dedf6L, + 0x5e7873f8a0396973L, 0xdb0b487b6423e1e8L, + 0x91ce1a9a3d2cda62L, 0x7641a140cc7810fbL, + 0xa9e904c87fcb0a9dL, 0x546345fa9fbdcd44L, + 0xa97c177947ad4095L, 0x49ed8eabcccc485dL, + 0x5c68f256bfff5a74L, 0x73832eec6fff3111L, + 0xc831fd53c5ff7eabL, 0xba3e7ca8b77f5e55L, + 0x28ce1bd2e55f35ebL, 0x7980d163cf5b81b3L, + 0xd7e105bcc332621fL, 0x8dd9472bf3fefaa7L, + 0xb14f98f6f0feb951L, 0x6ed1bf9a569f33d3L, + 0xa862f80ec4700c8L, 0xcd27bb612758c0faL, + 0x8038d51cb897789cL, 0xe0470a63e6bd56c3L, + 0x1858ccfce06cac74L, 0xf37801e0c43ebc8L, + 0xd30560258f54e6baL, 0x47c6b82ef32a2069L, + 0x4cdc331d57fa5441L, 0xe0133fe4adf8e952L, + 0x58180fddd97723a6L, 0x570f09eaa7ea7648L}; + + /** + * Prevents instantiation. + */ + private FastDoubleMath() { + + } + + static double decFloatLiteralToDouble(int index, boolean isNegative, long digits, int exponent, + int virtualIndexOfPoint, long exp_number, boolean isDigitsTruncated, int skipCountInTruncatedDigits) { + if (digits == 0) { + return isNegative ? 
-0.0 : 0.0; + } + final double outDouble; + if (isDigitsTruncated) { + final long exponentOfTruncatedDigits = + virtualIndexOfPoint - index + skipCountInTruncatedDigits + exp_number; + + // We have too many digits. We may have to round up. + // To know whether rounding up is needed, we may have to examine up to 768 digits. + + // There are cases, in which rounding has no effect. + if (FASTFLOAT_DEC_SMALLEST_POWER <= exponentOfTruncatedDigits + && exponentOfTruncatedDigits <= FASTFLOAT_DEC_LARGEST_POWER) { + double withoutRounding = + tryDecToDoubleWithFastAlgorithm(isNegative, digits, (int) exponentOfTruncatedDigits); + double roundedUp = + tryDecToDoubleWithFastAlgorithm(isNegative, digits + 1, (int) exponentOfTruncatedDigits); + if (!Double.isNaN(withoutRounding) && Objects.equals(roundedUp, withoutRounding)) { + return withoutRounding; + } + } + + // We have to take a slow path. + // return Double.parseDouble(str.toString()); + outDouble = Double.NaN; + + } else if (FASTFLOAT_DEC_SMALLEST_POWER <= exponent && exponent <= FASTFLOAT_DEC_LARGEST_POWER) { + outDouble = tryDecToDoubleWithFastAlgorithm(isNegative, digits, exponent); + } else { + outDouble = Double.NaN; + } + return outDouble; + } + + /** + * Computes {@code uint128 product = (uint64)x * (uint64)y}. + *
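+ * <p>
+ * For example (an illustrative identity, not from the original sources): with {@code x == y == 0xFFFFFFFFFFFFFFFFL}
+ * (that is, 2^64 - 1), the product is 2^128 - 2^65 + 1, so the result has {@code high == 0xFFFFFFFFFFFFFFFEL} and
+ * {@code low == 0x0000000000000001L}.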

+ * <p>
+ * References:
+ * <ul>
+ * <li>Getting the high part of 64 bit integer multiplication (stackoverflow)</li>
+ * </ul>
    + * + * @param x uint64 factor x + * @param y uint64 factor y + * @return uint128 product of x and y + */ + private static Value128 fullMultiplication(long x, long y) { + long x0 = x & 0xffffffffL, x1 = x >>> 32; + long y0 = y & 0xffffffffL, y1 = y >>> 32; + long p11 = x1 * y1, p01 = x0 * y1; + long p10 = x1 * y0, p00 = x0 * y0; + + // 64-bit product + two 32-bit values + long middle = p10 + (p00 >>> 32) + (p01 & 0xffffffffL); + return new Value128( + // 64-bit product + two 32-bit values + p11 + (middle >>> 32) + (p01 >>> 32), + // Add LOW PART and lower half of MIDDLE PART + (middle << 32) | (p00 & 0xffffffffL)); + } + + static double hexFloatLiteralToDouble(int index, boolean isNegative, long digits, long exponent, + int virtualIndexOfPoint, long exp_number, boolean isDigitsTruncated, int skipCountInTruncatedDigits) { + if (digits == 0) { + return isNegative ? -0.0 : 0.0; + } + final double outDouble; + if (isDigitsTruncated) { + final long truncatedExponent = (virtualIndexOfPoint - index + skipCountInTruncatedDigits) * 4L + + exp_number; + + // We have too many digits. We may have to round up. + // To know whether rounding up is needed, we may have to examine up to 768 digits. + + // There are cases, in which rounding has no effect. + if (FASTFLOAT_HEX_SMALLEST_POWER <= truncatedExponent && truncatedExponent <= FASTFLOAT_HEX_LARGEST_POWER) { + double withoutRounding = tryHexToDoubleWithFastAlgorithm(isNegative, digits, (int) truncatedExponent); + double roundedUp = tryHexToDoubleWithFastAlgorithm(isNegative, digits + 1, (int) truncatedExponent); + if (!Double.isNaN(withoutRounding) && Objects.equals(roundedUp, withoutRounding)) { + return withoutRounding; + } + } + + // We have to take a slow path. + outDouble = Double.NaN; + + } else if (FASTFLOAT_HEX_SMALLEST_POWER <= exponent && exponent <= FASTFLOAT_HEX_LARGEST_POWER) { + outDouble = tryHexToDoubleWithFastAlgorithm(isNegative, digits, (int) exponent); + } else { + outDouble = Double.NaN; + } + return outDouble; + } + + /** + * Attempts to compute {@literal digits * 10^(power)} exactly; and if "negative" is true, negate the result. + *
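+ * <p>
+ * For example (an illustrative call, not from the original sources):
+ * {@code tryDecToDoubleWithFastAlgorithm(false, 25, -1)} computes {@code 25 * 10^-1} via the fast path and returns
+ * exactly {@code 2.5}.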

+     * This function will only work in some cases; when it does not work, it returns {@link Double#NaN}. This should
+     * work *most of the time* (like 99% of the time). We assume that power is in the
+     * [FASTFLOAT_DEC_SMALLEST_POWER, FASTFLOAT_DEC_LARGEST_POWER] interval: the caller is responsible for this check.
+     *
+     * @param isNegative whether the number is negative
+     * @param digits uint64 the digits of the number
+     * @param power int32 the exponent of the number
+     * @return the computed double on success, {@link Double#NaN} on failure
+     */
+    static double tryDecToDoubleWithFastAlgorithm(boolean isNegative, long digits, int power) {
+        if (digits == 0 || power < -380 - 19) {
+            return isNegative ? -0.0 : 0.0;
+        }
+        if (power > 380) {
+            return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
+        }
+
+        // we start with a fast path
+        // It was described in
+        // Clinger WD. How to read floating point numbers accurately.
+        // ACM SIGPLAN Notices. 1990
+        if (-22 <= power && power <= 22 && Long.compareUnsigned(digits, 0x1fffffffffffffL) <= 0) {
+            // convert the integer into a double. This is lossless since
+            // 0 <= i <= 2^53 - 1.
+            double d = (double) digits;
+            //
+            // The general idea is as follows.
+            // If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
+            // 1) Both s and p can be represented exactly as 64-bit floating-point
+            // values (binary64).
+            // 2) Because s and p can be represented exactly as floating-point values,
+            // then s * p and s / p will produce correctly rounded values.
+            //
+            if (power < 0) {
+                d = d / powerOfTen[-power];
+            } else {
+                d = d * powerOfTen[power];
+            }
+            return (isNegative) ? -d : d;
+        }
+
+        // The fast path has now failed, so we are falling back on the slower path.
+
+        // We are going to need to do some 64-bit arithmetic to get a more precise product.
+        // We use a table lookup approach.
+        // It is safe because
+        // power >= FASTFLOAT_DEC_SMALLEST_POWER
+        // and power <= FASTFLOAT_DEC_LARGEST_POWER
+        // We recover the mantissa of the power, it has a leading 1. It is always
+        // rounded down.
+        long factor_mantissa = MANTISSA_64[power - FASTFLOAT_DEC_SMALLEST_POWER];
+
+        // The exponent is 1024 + 63 + power
+        // + floor(log(5**power)/log(2)).
+        // The 1024 comes from the ieee64 standard.
+        // The 63 comes from the fact that we use a 64-bit word.
+        //
+        // Computing floor(log(5**power)/log(2)) could be
+        // slow. Instead we use a fast function.
+        //
+        // For power in (-400,350), we have that
+        // (((152170 + 65536) * power ) >> 16);
+        // is equal to
+        // floor(log(5**power)/log(2)) + power when power >= 0
+        // and it is equal to
+        // ceil(log(5**-power)/log(2)) + power when power < 0
+        //
+        // The 65536 is (1<<16) and corresponds to
+        // (65536 * power) >> 16 ---> power
+        //
+        // ((152170 * power ) >> 16) is equal to
+        // floor(log(5**power)/log(2))
+        //
+        // Note that this is not magic: 152170/(1<<16) is
+        // approximately equal to log(5)/log(2).
+        // The 1<<16 value is a power of two; we could use a
+        // larger power of 2 if we wanted to.
+        //
+        long exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
+        // We want the most significant bit of digits to be 1. Shift if needed.
+        int lz = Long.numberOfLeadingZeros(digits);
+        digits <<= lz;
+        // We want the most significant 64 bits of the product. We know
+        // this will be non-zero because the most significant bit of i is
+        // 1.
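+        // (Descriptive note, added for clarity: after the shift above, digits has its top bit set, and
+        // factor_mantissa also has a leading 1 bit, so the full 128-bit product is at least 2^126 and
+        // its high 64 bits are guaranteed to be non-zero.)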
+        Value128 product = fullMultiplication(digits, factor_mantissa);
+        long lower = product.low;
+        long upper = product.high;
+        // We know that upper has at most one leading zero because
+        // both i and factor_mantissa have a leading one. This means
+        // that the result is at least as large as ((1<<63)*(1<<63))/(1<<64).
+
+        // As long as the first 9 bits of "upper" are not "1", then we
+        // know that we have an exact computed value for the leading
+        // 55 bits because any imprecision would play out as a +1, in
+        // the worst case.
+        // Having 55 bits is necessary because
+        // we need 53 bits for the mantissa but we have to have one rounding bit and
+        // we can waste a bit if the most significant bit of the product is zero.
+        // We expect this next branch to be rarely taken (say 1% of the time).
+        // When (upper & 0x1FF) == 0x1FF, it can be common for
+        // lower + i < lower to be true (proba. much higher than 1%).
+        if ((upper & 0x1FF) == 0x1FF && Long.compareUnsigned(lower + digits, lower) < 0) {
+            long factor_mantissa_low =
+                    MANTISSA_128[power - FASTFLOAT_DEC_SMALLEST_POWER];
+            // next, we compute the 64-bit x 128-bit multiplication, getting a 192-bit
+            // result (three 64-bit values)
+            product = fullMultiplication(digits, factor_mantissa_low);
+            long product_low = product.low;
+            long product_middle2 = product.high;
+            long product_middle1 = lower;
+            long product_high = upper;
+            long product_middle = product_middle1 + product_middle2;
+            if (Long.compareUnsigned(product_middle, product_middle1) < 0) {
+                product_high++; // overflow carry
+            }
+
+            // we want to check whether mantissa * i + i would affect our result
+            // This does happen, e.g. with 7.3177701707893310e+15
+            // (unsigned overflow check on product_low, mirroring the check on "lower" above)
+            if (((product_middle + 1 == 0) && ((product_high & 0x1ff) == 0x1ff) &&
+                    (Long.compareUnsigned(product_low + digits, product_low) < 0))) { // let us be prudent and bail out.
+                return Double.NaN;
+            }
+            upper = product_high;
+            // lower = product_middle;
+        }
+
+        // The final mantissa should be 53 bits with a leading 1.
+        // We shift it so that it occupies 54 bits with a leading 1.
+        long upperbit = upper >>> 63;
+        long mantissa = upper >>> (upperbit + 9);
+        lz += (int) (1 ^ upperbit);
+        // Here we have mantissa < (1<<54).
+
+        // We have to round to even. The "to even" part
+        // is only a problem when we are right in between two floats
+        // which we guard against.
+        // If we have lots of trailing zeros, we may fall right between two
+        // floating-point values.
+        if (((upper & 0x1ff) == 0x1ff)
+                || ((upper & 0x1ff) == 0) && (mantissa & 3) == 1) {
+            // if mantissa & 1 == 1 we might need to round up.
+            //
+            // Scenarios:
+            // 1. We are not in the middle. Then we should round up.
+            //
+            // 2. We are right in the middle. Whether we round up depends
+            // on the last significant bit: if it is "one" then we round
+            // up (round to even) otherwise, we do not.
+            //
+            // So if the last significant bit is 1, we can safely round up.
+            // Hence we only need to bail out if (mantissa & 3) == 1.
+            // Otherwise we may need more accuracy or analysis to determine whether
+            // we are exactly between two floating-point numbers.
+            // It can be triggered with 1e23.
+            // Note: because the factor_mantissa and factor_mantissa_low are
+            // almost always rounded down (except for small positive powers),
+            // we should almost always round up.
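+            // (Descriptive note, added for clarity: returning NaN here signals the caller to fall back
+            // to the exact, slower Double.parseDouble path rather than risk an incorrectly rounded result.)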
+ return Double.NaN; + } + + mantissa += 1; + mantissa >>>= 1; + + // Here we have mantissa < (1<<53), unless there was an overflow + if (mantissa >= (1L << 53)) { + // This will happen when parsing values such as 7.2057594037927933e+16 + mantissa = (1L << 52); + lz--; // undo previous addition + } + + mantissa &= ~(1L << 52); + long real_exponent = exponent - lz; + // we have to check that real_exponent is in range, otherwise we bail out + if ((real_exponent < 1) || (real_exponent > 2046)) { + return Double.NaN; + } + + long bits = mantissa | real_exponent << 52 + | (isNegative ? 1L << 63 : 0L); + return Double.longBitsToDouble(bits); + } + + /** + * Attempts to compute {@literal digits * 2^(power)} exactly; and if "negative" is true, negate the result. + *
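+     * <p>
+     * For example (an illustrative call, not from the original sources):
+     * {@code tryHexToDoubleWithFastAlgorithm(false, 0x18, -3)} computes {@code 24 * 2^-3} via the fast path and
+     * returns exactly {@code 3.0}.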

+     * This function will only work in some cases; when it does not work, it returns {@link Double#NaN}.
+     *
+     * @param isNegative whether the number is negative
+     * @param digits uint64 the digits of the number
+     * @param power int32 the exponent of the number
+     * @return the computed double on success, {@link Double#NaN} on failure
+     */
+    static double tryHexToDoubleWithFastAlgorithm(boolean isNegative, long digits, int power) {
+        if (digits == 0 || power < Double.MIN_EXPONENT - 54) {
+            return isNegative ? -0.0 : 0.0;
+        }
+        if (power > Double.MAX_EXPONENT) {
+            return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
+        }
+
+        // we start with a fast path
+        // We try to mimic the fast path described by Clinger WD for decimal
+        // float number literals. How to read floating point numbers accurately.
+        // ACM SIGPLAN Notices. 1990
+        if (Long.compareUnsigned(digits, 0x1fffffffffffffL) <= 0) {
+            // convert the integer into a double. This is lossless since
+            // 0 <= i <= 2^53 - 1.
+            double d = (double) digits;
+            //
+            // The general idea is as follows.
+            // If 0 <= s < 2^53 then
+            // 1) Both s and p can be represented exactly as 64-bit floating-point
+            // values (binary64).
+            // 2) Because s and p can be represented exactly as floating-point values,
+            // then s * p will produce correctly rounded values.
+            //
+            d = d * Math.scalb(1d, power);
+            if (isNegative) {
+                d = -d;
+            }
+            return d;
+        }
+
+        // The fast path has failed
+        return Double.NaN;
+    }
+
+    private static class Value128 {
+
+        final long high, low;
+
+        private Value128(long high, long low) {
+            this.high = high;
+            this.low = low;
+        }
+    }
+}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParser.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParser.java
new file mode 100644
index 00000000000..5b0f8784c07
--- /dev/null
+++ b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParser.java
@@ -0,0 +1,541 @@
+/*
+ * @(#)FastDoubleParser.java Copyright © 2021. Werner Randelshofer, Switzerland. MIT License.
+ */
+
+/*
+ * TEMPORARY Modifications by kosak: change package name FOR NOW (until we correctly include this jar in our build)
+ */
+
+package io.deephaven.csv.tokenization.external;
+
+/**
+ * This is a C++ to Java port of Daniel Lemire's fast_double_parser.
+ *
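+ * <p>
+ * Typical usage (an illustrative sketch, not from the original sources):
+ * <pre>{@code
+ * double x = FastDoubleParser.parseDouble("3.14159"); // decimal notation
+ * double y = FastDoubleParser.parseDouble("0x1.8p1"); // hexadecimal notation, equal to 3.0
+ * }</pre>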

+ * The code has been changed so that it parses the same syntax as {@link Double#parseDouble(String)}.
+ *

+ * <p>
+ * References:
+ * <ul>
+ * <li>Daniel Lemire, fast_double_parser, 4x faster than strtod.
+ *     Apache License 2.0 or Boost Software License. (github.com)</li>
+ * <li>Daniel Lemire, fast_float number parsing library: 4x faster than strtod.
+ *     Apache License 2.0. (github.com)</li>
+ * <li>Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 (8), 2021.
+ *     arXiv.2101.11408v3 [cs.DS] 24 Feb 2021 (arxiv.org)</li>
+ * </ul>
+ *
+ */
+public class FastDoubleParser {
+    private final static long MINIMAL_NINETEEN_DIGIT_INTEGER = 1000_00000_00000_00000L;
+    private final static int MINIMAL_EIGHT_DIGIT_INTEGER = 10_000_000;
+    /**
+     * Special value in {@link #CHAR_TO_HEX_MAP} for the decimal point character.
+     */
+    private static final byte DECIMAL_POINT_CLASS = -4;
+    /**
+     * Special value in {@link #CHAR_TO_HEX_MAP} for characters that are neither a hex digit nor a decimal point
+     * character.
+     */
+    private static final byte OTHER_CLASS = -1;
+    /**
+     * A table of 128 entries, or of entries up to and including character 'p', would suffice.
+     *
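+     * <p>
+     * For example, after the static initializer below runs, {@code CHAR_TO_HEX_MAP['b'] == 11},
+     * {@code CHAR_TO_HEX_MAP['.'] == DECIMAL_POINT_CLASS}, and {@code CHAR_TO_HEX_MAP['z'] == OTHER_CLASS}.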

    + * However for some reason, performance is best, if this table has exactly 256 entries. + */ + private static final byte[] CHAR_TO_HEX_MAP = new byte[256]; + + static { + for (char ch = 0; ch < CHAR_TO_HEX_MAP.length; ch++) { + CHAR_TO_HEX_MAP[ch] = OTHER_CLASS; + } + for (char ch = '0'; ch <= '9'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - '0'); + } + for (char ch = 'A'; ch <= 'F'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - 'A' + 10); + } + for (char ch = 'a'; ch <= 'f'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - 'a' + 10); + } + for (char ch = '.'; ch <= '.'; ch++) { + CHAR_TO_HEX_MAP[ch] = DECIMAL_POINT_CLASS; + } + } + + /** + * Prevents instantiation. + */ + private FastDoubleParser() { + + } + + private static boolean isDigit(char c) { + return '0' <= c && c <= '9'; + } + + private static NumberFormatException newNumberFormatException(CharSequence str) { + if (str.length() > 1024) { + // str can be up to Integer.MAX_VALUE characters long + return new NumberFormatException("For input string of length " + str.length()); + } else { + return new NumberFormatException("For input string: \"" + str.toString().trim() + "\""); + } + } + + /** + * Convenience method for calling {@link #parseDouble(CharSequence, int, int)}. + * + * @param str the string to be parsed + * @return the parsed double value + * @throws NumberFormatException if the string can not be parsed + */ + public static double parseDouble(CharSequence str) throws NumberFormatException { + return parseDouble(str, 0, str.length()); + } + + /** + * Returns a Double object holding the double value represented by the argument string {@code str}. + *

+ * This method can be used as a drop-in replacement for the method {@link Double#valueOf(String)}. (Assuming that
+ * the API of this method has not changed since Java SE 16.)
+ *

    + * Leading and trailing whitespace characters in {@code str} are ignored. Whitespace is removed as if by the + * {@link String#trim()} method; that is, characters in the range [U+0000,U+0020]. + *

    + * The rest of {@code str} should constitute a FloatValue as described by the lexical syntax rules shown below: + *

+ * <pre>
+ * FloatValue:
+ *     [Sign] {@code NaN}
+ *     [Sign] {@code Infinity}
+ *     [Sign] DecimalFloatingPointLiteral
+ *     [Sign] HexFloatingPointLiteral
+ *     SignedInteger
+ *
+ * HexFloatingPointLiteral:
+ *     HexSignificand BinaryExponent
+ *
+ * HexSignificand:
+ *     HexNumeral
+ *     HexNumeral {@code .}
+ *     {@code 0x} [HexDigits] {@code .} HexDigits
+ *     {@code 0X} [HexDigits] {@code .} HexDigits
+ *
+ * BinaryExponent:
+ *     BinaryExponentIndicator SignedInteger
+ *
+ * BinaryExponentIndicator:
+ *     {@code p}
+ *     {@code P}
+ *
+ * DecimalFloatingPointLiteral:
+ *     Digits {@code .} [Digits] [ExponentPart]
+ *     {@code .} Digits [ExponentPart]
+ *     Digits ExponentPart
+ *
+ * ExponentPart:
+ *     ExponentIndicator SignedInteger
+ *
+ * ExponentIndicator:
+ *     (one of) e E
+ *
+ * SignedInteger:
+ *     [Sign] Digits
+ *
+ * Sign:
+ *     (one of) + -
+ *
+ * Digits:
+ *     Digit {Digit}
+ *
+ * HexNumeral:
+ *     {@code 0} {@code x} HexDigits
+ *     {@code 0} {@code X} HexDigits
+ *
+ * HexDigits:
+ *     HexDigit {HexDigit}
+ *
+ * HexDigit:
+ *     (one of) {@code 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F}
+ * </pre>
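+ * <p>
+ * Examples of accepted inputs (illustrative): {@code "3.14"}, {@code "-1.2e-3"}, {@code "0x1.8p1"}
+ * (a hexadecimal literal equal to 3.0), {@code "NaN"}, and {@code "-Infinity"}.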
    + * + * @param str the string to be parsed + * @param offset The index of the first character to parse + * @param length The number of characters to parse + * @return the parsed double value + * @throws NumberFormatException if the string can not be parsed + */ + public static double parseDouble(CharSequence str, int offset, int length) throws NumberFormatException { + final int endIndex = offset + length; + + // Skip leading whitespace + // ------------------- + int index = skipWhitespace(str, offset, endIndex); + if (index == endIndex) { + throw new NumberFormatException("empty String"); + } + char ch = str.charAt(index); + + // Parse optional sign + // ------------------- + final boolean isNegative = ch == '-'; + if (isNegative || ch == '+') { + ch = ++index < endIndex ? str.charAt(index) : 0; + if (ch == 0) { + throw newNumberFormatException(str); + } + } + + // Parse NaN or Infinity + // --------------------- + if (ch == 'N') { + return parseNaN(str, index, endIndex); + } else if (ch == 'I') { + return parseInfinity(str, index, endIndex, isNegative); + } + + // Parse optional leading zero + // --------------------------- + final boolean hasLeadingZero = ch == '0'; + if (hasLeadingZero) { + ch = ++index < endIndex ? str.charAt(index) : 0; + if (ch == 'x' || ch == 'X') { + return parseRestOfHexFloatingPointLiteral(str, index + 1, endIndex, isNegative); + } + } + + return parseRestOfDecimalFloatLiteral(str, endIndex, index, isNegative, hasLeadingZero); + } + + private static double parseInfinity(CharSequence str, int index, int endIndex, boolean negative) { + if (index + 7 < endIndex + // && str.charAt(index) == 'I' + && str.charAt(index + 1) == 'n' + && str.charAt(index + 2) == 'f' + && str.charAt(index + 3) == 'i' + && str.charAt(index + 4) == 'n' + && str.charAt(index + 5) == 'i' + && str.charAt(index + 6) == 't' + && str.charAt(index + 7) == 'y') { + index = skipWhitespace(str, index + 8, endIndex); + if (index < endIndex) { + throw newNumberFormatException(str); + } + return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + } else { + throw newNumberFormatException(str); + } + } + + private static double parseNaN(CharSequence str, int index, int endIndex) { + if (index + 2 < endIndex + // && str.charAt(index) == 'N' + && str.charAt(index + 1) == 'a' + && str.charAt(index + 2) == 'N') { + + index = skipWhitespace(str, index + 3, endIndex); + if (index < endIndex) { + throw newNumberFormatException(str); + } + + return Double.NaN; + } else { + throw newNumberFormatException(str); + } + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble(CharSequence)}): + *
+ * <pre>
+ * RestOfDecimalFloatingPointLiteral:
+ *     [Digits] {@code .} [Digits] [ExponentPart]
+ *     {@code .} Digits [ExponentPart]
+ *     [Digits] ExponentPart
+ * </pre>
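+ * <p>
+ * For example (illustrative): for the input {@code "0.25"}, the leading zero has already been consumed by
+ * {@link #parseDouble(CharSequence, int, int)}, and this method parses the remaining {@code ".25"}.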
    + * + * @param str the input string + * @param endIndex the end index of the string + * @param index index to the first character of RestOfHexFloatingPointLiteral + * @param isNegative if the resulting number is negative + * @param hasLeadingZero if the digit '0' has been consumed + * @return a double representation + */ + private static double parseRestOfDecimalFloatLiteral(CharSequence str, int endIndex, int index, boolean isNegative, + boolean hasLeadingZero) { + // Parse digits + // ------------ + // Note: a multiplication by a constant is cheaper than an + // arbitrary integer multiplication. + long digits = 0;// digits is treated as an unsigned long + int exponent = 0; + final int indexOfFirstDigit = index; + int virtualIndexOfPoint = -1; + final int digitCount; + char ch = 0; + for (; index < endIndex; index++) { + ch = str.charAt(index); + if (isDigit(ch)) { + // This might overflow, we deal with it later. + digits = 10 * digits + ch - '0'; + } else if (ch == '.') { + if (virtualIndexOfPoint != -1) { + throw newNumberFormatException(str); + } + virtualIndexOfPoint = index; + } else { + break; + } + } + final int indexAfterDigits = index; + if (virtualIndexOfPoint == -1) { + digitCount = indexAfterDigits - indexOfFirstDigit; + virtualIndexOfPoint = indexAfterDigits; + } else { + digitCount = indexAfterDigits - indexOfFirstDigit - 1; + exponent = virtualIndexOfPoint - index + 1; + } + + // Parse exponent number + // --------------------- + long exp_number = 0; + final boolean hasExponent = (ch == 'e') || (ch == 'E'); + if (hasExponent) { + ch = ++index < endIndex ? str.charAt(index) : 0; + boolean neg_exp = ch == '-'; + if (neg_exp || ch == '+') { + ch = ++index < endIndex ? str.charAt(index) : 0; + } + if (!isDigit(ch)) { + throw newNumberFormatException(str); + } + do { + // Guard against overflow of exp_number + if (exp_number < MINIMAL_EIGHT_DIGIT_INTEGER) { + exp_number = 10 * exp_number + ch - '0'; + } + ch = ++index < endIndex ? str.charAt(index) : 0; + } while (isDigit(ch)); + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + + // Skip trailing whitespace + // ------------------------ + index = skipWhitespace(str, index, endIndex); + if (index < endIndex + || !hasLeadingZero && digitCount == 0 && str.charAt(virtualIndexOfPoint) != '.') { + throw newNumberFormatException(str); + } + + // Re-parse digits in case of a potential overflow + // ----------------------------------------------- + final boolean isDigitsTruncated; + int skipCountInTruncatedDigits = 0;// counts +1 if we skipped over the decimal point + if (digitCount > 19) { + digits = 0; + for (index = indexOfFirstDigit; index < indexAfterDigits; index++) { + ch = str.charAt(index); + if (ch == '.') { + skipCountInTruncatedDigits++; + } else { + if (Long.compareUnsigned(digits, MINIMAL_NINETEEN_DIGIT_INTEGER) < 0) { + digits = 10 * digits + ch - '0'; + } else { + break; + } + } + } + isDigitsTruncated = (index < indexAfterDigits); + } else { + isDigitsTruncated = false; + } + double result = FastDoubleMath.decFloatLiteralToDouble(index, isNegative, digits, exponent, virtualIndexOfPoint, + exp_number, isDigitsTruncated, skipCountInTruncatedDigits); + return Double.isNaN(result) ? parseRestOfDecimalFloatLiteralTheHardWay(str) : result; + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble(CharSequence)}): + *
+ * <pre>
+ * RestOfDecimalFloatingPointLiteral:
+ *     [Digits] {@code .} [Digits] [ExponentPart]
+ *     {@code .} Digits [ExponentPart]
+ *     [Digits] ExponentPart
+ * </pre>
    + * + * @param str the input string + */ + private static double parseRestOfDecimalFloatLiteralTheHardWay(CharSequence str) { + return Double.parseDouble(str.toString()); + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble(CharSequence)}): + *
+ * <pre>
+ * RestOfHexFloatingPointLiteral:
+ *     RestOfHexSignificand BinaryExponent
+ *
+ * RestOfHexSignificand:
+ *     HexDigits
+ *     HexDigits {@code .}
+ *     [HexDigits] {@code .} HexDigits
+ * </pre>
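+ * <p>
+ * For example (illustrative): for the input {@code "0x1.8p1"}, this method is called with {@code index} pointing
+ * just past the {@code "0x"} prefix, parses {@code "1.8p1"}, and returns {@code 3.0}.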
    + * + * @param str the input string + * @param index index to the first character of RestOfHexFloatingPointLiteral + * @param endIndex the end index of the string + * @param isNegative if the resulting number is negative + * @return a double representation + */ + private static double parseRestOfHexFloatingPointLiteral( + CharSequence str, int index, int endIndex, boolean isNegative) { + + // Parse digits + // ------------ + long digits = 0;// digits is treated as an unsigned long + int exponent = 0; + final int indexOfFirstDigit = index; + int virtualIndexOfPoint = -1; + final int digitCount; + char ch = 0; + for (; index < endIndex; index++) { + ch = str.charAt(index); + // Table look up is faster than a sequence of if-else-branches. + int hexValue = ch > 127 ? OTHER_CLASS : CHAR_TO_HEX_MAP[ch]; + if (hexValue >= 0) { + digits = (digits << 4) | hexValue;// This might overflow, we deal with it later. + } else if (hexValue == DECIMAL_POINT_CLASS) { + if (virtualIndexOfPoint != -1) { + throw newNumberFormatException(str); + } + virtualIndexOfPoint = index; + } else { + break; + } + } + final int indexAfterDigits = index; + if (virtualIndexOfPoint == -1) { + digitCount = indexAfterDigits - indexOfFirstDigit; + virtualIndexOfPoint = indexAfterDigits; + } else { + digitCount = indexAfterDigits - indexOfFirstDigit - 1; + exponent = Math.min(virtualIndexOfPoint - index + 1, MINIMAL_EIGHT_DIGIT_INTEGER) * 4; + } + + // Parse exponent number + // --------------------- + long exp_number = 0; + final boolean hasExponent = (ch == 'p') || (ch == 'P'); + if (hasExponent) { + ch = ++index < endIndex ? str.charAt(index) : 0; + boolean neg_exp = ch == '-'; + if (neg_exp || ch == '+') { + ch = ++index < endIndex ? str.charAt(index) : 0; + } + if (!isDigit(ch)) { + throw newNumberFormatException(str); + } + do { + // Guard against overflow of exp_number + if (exp_number < MINIMAL_EIGHT_DIGIT_INTEGER) { + exp_number = 10 * exp_number + ch - '0'; + } + ch = ++index < endIndex ? str.charAt(index) : 0; + } while (isDigit(ch)); + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + + // Skip trailing whitespace + // ------------------------ + index = skipWhitespace(str, index, endIndex); + if (index < endIndex + || digitCount == 0 && str.charAt(virtualIndexOfPoint) != '.' + || !hasExponent) { + throw newNumberFormatException(str); + } + + // Re-parse digits in case of a potential overflow + // ----------------------------------------------- + final boolean isDigitsTruncated; + int skipCountInTruncatedDigits = 0;// counts +1 if we skipped over the decimal point + if (digitCount > 16) { + digits = 0; + for (index = indexOfFirstDigit; index < indexAfterDigits; index++) { + ch = str.charAt(index); + // Table look up is faster than a sequence of if-else-branches. + int hexValue = ch > 127 ? OTHER_CLASS : CHAR_TO_HEX_MAP[ch]; + if (hexValue >= 0) { + if (Long.compareUnsigned(digits, MINIMAL_NINETEEN_DIGIT_INTEGER) < 0) { + digits = (digits << 4) | hexValue; + } else { + break; + } + } else { + skipCountInTruncatedDigits++; + } + } + isDigitsTruncated = (index < indexAfterDigits); + } else { + isDigitsTruncated = false; + } + + double d = FastDoubleMath.hexFloatLiteralToDouble(index, isNegative, digits, exponent, virtualIndexOfPoint, + exp_number, isDigitsTruncated, skipCountInTruncatedDigits); + return Double.isNaN(d) ? 
Double.parseDouble(str.toString()) : d; + } + + private static int skipWhitespace(CharSequence str, int index, int endIndex) { + for (; index < endIndex; index++) { + if (str.charAt(index) > 0x20) { + break; + } + } + return index; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParserFromByteArray.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParserFromByteArray.java new file mode 100644 index 00000000000..f99d5874f1e --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParserFromByteArray.java @@ -0,0 +1,589 @@ +/* + * @(#)FastDoubleParser.java Copyright © 2021. Werner Randelshofer, Switzerland. MIT License. + */ + +/* + * TEMPORARY Modifications by kosak: change package name FOR NOW (until we correctly include this jar in our build) + */ + +package io.deephaven.csv.tokenization.external; + +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; +import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; + +/** + * This is a C++ to Java port of Daniel Lemire's fast_double_parser. + *
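+ * <p>
+ * Typical usage (an illustrative sketch, not from the original sources):
+ * <pre>{@code
+ * byte[] ascii = "1.75E3".getBytes(StandardCharsets.US_ASCII);
+ * double v = FastDoubleParserFromByteArray.parseDouble(ascii); // 1750.0
+ * }</pre>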

+ * The code has been changed so that it parses the same syntax as {@link Double#parseDouble(String)}.
+ *

+ * <p>
+ * References:
+ * <ul>
+ * <li>Daniel Lemire, fast_double_parser, 4x faster than strtod.
+ *     Apache License 2.0 or Boost Software License. (github.com)</li>
+ * <li>Daniel Lemire, fast_float number parsing library: 4x faster than strtod.
+ *     Apache License 2.0. (github.com)</li>
+ * <li>Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 (8), 2021.
+ *     arXiv.2101.11408v3 [cs.DS] 24 Feb 2021 (arxiv.org)</li>
+ * </ul>
+ *
+ */
+public class FastDoubleParserFromByteArray {
+    private final static long MINIMAL_NINETEEN_DIGIT_INTEGER = 1000_00000_00000_00000L;
+    private final static int MINIMAL_EIGHT_DIGIT_INTEGER = 10_000_000;
+    /**
+     * Special value in {@link #CHAR_TO_HEX_MAP} for the decimal point character.
+     */
+    private static final byte DECIMAL_POINT_CLASS = -4;
+    /**
+     * Special value in {@link #CHAR_TO_HEX_MAP} for characters that are neither a hex digit nor a decimal point
+     * character.
+     */
+    private static final byte OTHER_CLASS = -1;
+    /**
+     * A table of 128 entries, or of entries up to and including character 'p', would suffice.
+     *

    + * However for some reason, performance is best, if this table has exactly 256 entries. + */ + private static final byte[] CHAR_TO_HEX_MAP = new byte[256]; + private final static VarHandle readLongFromByteArray = + MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.LITTLE_ENDIAN); + + static { + for (char ch = 0; ch < CHAR_TO_HEX_MAP.length; ch++) { + CHAR_TO_HEX_MAP[ch] = OTHER_CLASS; + } + for (char ch = '0'; ch <= '9'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - '0'); + } + for (char ch = 'A'; ch <= 'F'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - 'A' + 10); + } + for (char ch = 'a'; ch <= 'f'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - 'a' + 10); + } + for (char ch = '.'; ch <= '.'; ch++) { + CHAR_TO_HEX_MAP[ch] = DECIMAL_POINT_CLASS; + } + } + + /** + * Prevents instantiation. + */ + private FastDoubleParserFromByteArray() { + + } + + private static boolean isDigit(byte c) { + return (byte) '0' <= c && c <= (byte) '9'; + } + + private static NumberFormatException newNumberFormatException(byte[] str, int off, int len) { + if (len > 1024) { + // str can be up to Integer.MAX_VALUE characters long + return new NumberFormatException("For input string of length " + len); + } else { + return new NumberFormatException( + "For input string: \"" + new String(str, off, len, StandardCharsets.ISO_8859_1) + "\""); + } + } + + /** + * Convenience method for calling {@link #parseDouble(byte[], int, int)}. + * + * @param str the string to be parsed, a byte array with characters in ISO-8859-1, ASCII or UTF-8 encoding + * @return the parsed double value + * @throws NumberFormatException if the string can not be parsed + */ + public static double parseDouble(byte[] str) throws NumberFormatException { + return parseDouble(str, 0, str.length); + } + + /** + * Returns a Double object holding the double value represented by the argument string {@code str}. + *

+ * This method can be used as a drop-in replacement for the method {@link Double#valueOf(String)}. (Assuming that
+ * the API of this method has not changed since Java SE 16.)
+ *

    + * Leading and trailing whitespace characters in {@code str} are ignored. Whitespace is removed as if by the + * {@link String#trim()} method; that is, characters in the range [U+0000,U+0020]. + *

    + * The rest of {@code str} should constitute a FloatValue as described by the lexical syntax rules shown below: + *

+ * <pre>
+ * FloatValue:
+ *     [Sign] {@code NaN}
+ *     [Sign] {@code Infinity}
+ *     [Sign] DecimalFloatingPointLiteral
+ *     [Sign] HexFloatingPointLiteral
+ *     SignedInteger
+ *
+ * HexFloatingPointLiteral:
+ *     HexSignificand BinaryExponent
+ *
+ * HexSignificand:
+ *     HexNumeral
+ *     HexNumeral {@code .}
+ *     {@code 0x} [HexDigits] {@code .} HexDigits
+ *     {@code 0X} [HexDigits] {@code .} HexDigits
+ *
+ * BinaryExponent:
+ *     BinaryExponentIndicator SignedInteger
+ *
+ * BinaryExponentIndicator:
+ *     {@code p}
+ *     {@code P}
+ *
+ * DecimalFloatingPointLiteral:
+ *     Digits {@code .} [Digits] [ExponentPart]
+ *     {@code .} Digits [ExponentPart]
+ *     Digits ExponentPart
+ *
+ * ExponentPart:
+ *     ExponentIndicator SignedInteger
+ *
+ * ExponentIndicator:
+ *     (one of) e E
+ *
+ * SignedInteger:
+ *     [Sign] Digits
+ *
+ * Sign:
+ *     (one of) + -
+ *
+ * Digits:
+ *     Digit {Digit}
+ *
+ * HexNumeral:
+ *     {@code 0} {@code x} HexDigits
+ *     {@code 0} {@code X} HexDigits
+ *
+ * HexDigits:
+ *     HexDigit {HexDigit}
+ *
+ * HexDigit:
+ *     (one of) {@code 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F}
+ * </pre>
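+ * <p>
+ * For example (illustrative): {@code parseDouble("365.24".getBytes(StandardCharsets.US_ASCII), 0, 6)} returns
+ * {@code 365.24}.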
    + * + * @param str the string to be parsed, a byte array with characters in ISO-8859-1, ASCII or UTF-8 encoding + * @param off The index of the first byte to parse + * @param len The number of bytes to parse + * @return the parsed double value + * @throws NumberFormatException if the string can not be parsed + */ + public static double parseDouble(byte[] str, int off, int len) throws NumberFormatException { + final int endIndex = len + off; + + // Skip leading whitespace + // ------------------- + int index = skipWhitespace(str, off, endIndex); + if (index == endIndex) { + throw new NumberFormatException("empty String"); + } + byte ch = str[index]; + + // Parse optional sign + // ------------------- + final boolean isNegative = ch == '-'; + if (isNegative || ch == '+') { + ch = ++index < endIndex ? str[index] : 0; + if (ch == 0) { + throw newNumberFormatException(str, off, len); + } + } + + // Parse NaN or Infinity + // --------------------- + if (ch == 'N') { + return parseNaN(str, index, endIndex, off); + } else if (ch == 'I') { + return parseInfinity(str, index, endIndex, isNegative, off); + } + + // Parse optional leading zero + // --------------------------- + final boolean hasLeadingZero = ch == '0'; + if (hasLeadingZero) { + ch = ++index < endIndex ? str[index] : 0; + if (ch == 'x' || ch == 'X') { + return parseRestOfHexFloatingPointLiteral(str, index + 1, endIndex, isNegative, off); + } + } + + return parseRestOfDecimalFloatLiteral(str, endIndex, index, isNegative, hasLeadingZero, off); + } + + /** + * Tries to parse eight digits from a byte array provided in a long. + * + * @param value an array of 8 bytes in a long + * @return the parsed digits or -1 on failure + */ + private static int tryToParseEightDigits(long value) { + long val = value - 0x3030303030303030L; + long l = ((value + 0x4646464646464646L) | val) & + 0x8080808080808080L; + if (l != 0L) { + return -1; + } + + + long mask = 0x000000FF000000FFL; + long mul1 = 0x000F424000000064L; // 100 + (1000000ULL << 32) + long mul2 = 0x0000271000000001L; // 1 + (10000ULL << 32) + val = (val * 10) + (val >>> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >>> 16) & mask) * mul2)) >>> 32; + return (int) (val); + } + + private static double parseInfinity(byte[] str, int index, int endIndex, boolean negative, int off) { + if (index + 7 < endIndex + // && str.charAt(index) == 'I' + && str[index + 1] == (byte) 'n' + && str[index + 2] == (byte) 'f' + && str[index + 3] == (byte) 'i' + && str[index + 4] == (byte) 'n' + && str[index + 5] == (byte) 'i' + && str[index + 6] == (byte) 't' + && str[index + 7] == (byte) 'y') { + index = skipWhitespace(str, index + 8, endIndex); + if (index < endIndex) { + throw newNumberFormatException(str, off, endIndex - off); + } + return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + } else { + throw newNumberFormatException(str, off, endIndex - off); + } + } + + private static double parseNaN(byte[] str, int index, int endIndex, int off) { + if (index + 2 < endIndex + // && str.charAt(index) == 'N' + && str[index + 1] == (byte) 'a' + && str[index + 2] == (byte) 'N') { + + index = skipWhitespace(str, index + 3, endIndex); + if (index < endIndex) { + throw newNumberFormatException(str, off, endIndex - off); + } + + return Double.NaN; + } else { + throw newNumberFormatException(str, off, endIndex - off); + } + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble}): + *
    + * <pre>
    + * RestOfDecimalFloatingPointLiteral:
    + *   [Digits] {@code .} [Digits] [ExponentPart]
    + *   {@code .} Digits [ExponentPart]
    + *   [Digits] ExponentPart
    + * </pre>
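    + * For example (an illustrative sketch of the production, not an exhaustive list):
    + * <pre>{@code
    + * // "123.45"  matches  [Digits] . [Digits]
    + * // ".5e10"   matches  . Digits [ExponentPart]
    + * // "7E-3"    matches  [Digits] ExponentPart
    + * }</pre>
    + * After the decimal point, the digit loop calls {@link #tryToParseEightDigits} to consume eight ASCII digits at
    + * a time with branch-free (SWAR) arithmetic, falling back to the per-digit loop when fewer than eight characters
    + * remain or one of them is not a digit.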
    + * + * @param str the input string + * @param endIndex the end index of the string + * @param index index to the first character of RestOfHexFloatingPointLiteral + * @param isNegative if the resulting number is negative + * @param hasLeadingZero if the digit '0' has been consumed + * @return a double representation + */ + private static double parseRestOfDecimalFloatLiteral(byte[] str, int endIndex, int index, boolean isNegative, + boolean hasLeadingZero, int off) { + // Parse digits + // ------------ + // Note: a multiplication by a constant is cheaper than an + // arbitrary integer multiplication. + long digits = 0;// digits is treated as an unsigned long + int exponent = 0; + final int indexOfFirstDigit = index; + int virtualIndexOfPoint = -1; + final int digitCount; + byte ch = 0; + for (; index < endIndex; index++) { + ch = str[index]; + if (isDigit(ch)) { + // This might overflow, we deal with it later. + digits = 10 * digits + ch - '0'; + } else if (ch == '.') { + if (virtualIndexOfPoint != -1) { + throw newNumberFormatException(str, off, endIndex - off); + } + virtualIndexOfPoint = index; + while (index < endIndex - 9) { + long val = (long) readLongFromByteArray.get(str, index + 1); + int parsed = tryToParseEightDigits(val); + if (parsed >= 0) { + // This might overflow, we deal with it later. + digits = digits * 100_000_000L + parsed; + index += 8; + } else { + break; + } + } + } else { + break; + } + } + final int indexAfterDigits = index; + if (virtualIndexOfPoint == -1) { + digitCount = indexAfterDigits - indexOfFirstDigit; + virtualIndexOfPoint = indexAfterDigits; + } else { + digitCount = indexAfterDigits - indexOfFirstDigit - 1; + exponent = virtualIndexOfPoint - index + 1; + } + + // Parse exponent number + // --------------------- + long exp_number = 0; + final boolean hasExponent = (ch == 'e') || (ch == 'E'); + if (hasExponent) { + ch = ++index < endIndex ? str[index] : 0; + boolean neg_exp = ch == '-'; + if (neg_exp || ch == '+') { + ch = ++index < endIndex ? str[index] : 0; + } + if (!isDigit(ch)) { + throw newNumberFormatException(str, off, endIndex - off); + } + do { + // Guard against overflow of exp_number + if (exp_number < MINIMAL_EIGHT_DIGIT_INTEGER) { + exp_number = 10 * exp_number + ch - '0'; + } + ch = ++index < endIndex ? 
str[index] : 0; + } while (isDigit(ch)); + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + + // Skip trailing whitespace + // ------------------------ + index = skipWhitespace(str, index, endIndex); + if (index < endIndex + || !hasLeadingZero && digitCount == 0 && str[virtualIndexOfPoint] != '.') { + throw newNumberFormatException(str, off, endIndex - off); + } + + // Re-parse digits in case of a potential overflow + // ----------------------------------------------- + final boolean isDigitsTruncated; + int skipCountInTruncatedDigits = 0;// counts +1 if we skipped over the decimal point + if (digitCount > 19) { + digits = 0; + for (index = indexOfFirstDigit; index < indexAfterDigits; index++) { + ch = str[index]; + if (ch == '.') { + skipCountInTruncatedDigits++; + } else { + if (Long.compareUnsigned(digits, MINIMAL_NINETEEN_DIGIT_INTEGER) < 0) { + digits = 10 * digits + ch - '0'; + } else { + break; + } + } + } + isDigitsTruncated = index < indexAfterDigits; + } else { + isDigitsTruncated = false; + } + + double result = FastDoubleMath.decFloatLiteralToDouble(index, isNegative, digits, exponent, virtualIndexOfPoint, + exp_number, isDigitsTruncated, skipCountInTruncatedDigits); + return Double.isNaN(result) ? parseRestOfDecimalFloatLiteralTheHardWay(str, off, endIndex - off) : result; + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble}): + *
    + * <pre>
    + * RestOfDecimalFloatingPointLiteral:
    + *   [Digits] {@code .} [Digits] [ExponentPart]
    + *   {@code .} Digits [ExponentPart]
    + *   [Digits] ExponentPart
    + * </pre>
    + * + * @param str the input string + */ + private static double parseRestOfDecimalFloatLiteralTheHardWay(byte[] str, int off, int len) { + return Double.parseDouble(new String(str, off, len, StandardCharsets.ISO_8859_1)); + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble}): + *
    + * <pre>
    + * RestOfHexFloatingPointLiteral:
    + *   RestOfHexSignificand BinaryExponent
    + *
    + * RestOfHexSignificand:
    + *   HexDigits
    + *   HexDigits {@code .}
    + *   [HexDigits] {@code .} HexDigits
    + * </pre>
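    + * For example (an illustrative sketch, assuming the {@code 0x}/{@code 0X} prefix and optional sign were already
    + * consumed by {@link #parseDouble}):
    + * <pre>{@code
    + * // "1.8p3"  ->  0x1.8p3  ==  1.5 * 2^3  ==  12.0
    + * }</pre>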
    + * + * @param str the input string + * @param index index to the first character of RestOfHexFloatingPointLiteral + * @param endIndex the end index of the string + * @param isNegative if the resulting number is negative + * @param off offset from the start where character of interest start + * @return a double representation + */ + private static double parseRestOfHexFloatingPointLiteral( + byte[] str, int index, int endIndex, boolean isNegative, int off) { + if (index >= endIndex) { + throw newNumberFormatException(str, off, endIndex - off); + } + + // Parse digits + // ------------ + long digits = 0;// digits is treated as an unsigned long + int exponent = 0; + final int indexOfFirstDigit = index; + int virtualIndexOfPoint = -1; + final int digitCount; + byte ch = 0; + for (; index < endIndex; index++) { + ch = str[index]; + // Table look up is faster than a sequence of if-else-branches. + int hexValue = ch < 0 ? OTHER_CLASS : CHAR_TO_HEX_MAP[ch]; + if (hexValue >= 0) { + digits = (digits << 4) | hexValue;// This might overflow, we deal with it later. + } else if (hexValue == DECIMAL_POINT_CLASS) { + if (virtualIndexOfPoint != -1) { + throw newNumberFormatException(str, off, endIndex - off); + } + virtualIndexOfPoint = index; + } else { + break; + } + } + final int indexAfterDigits = index; + if (virtualIndexOfPoint == -1) { + digitCount = indexAfterDigits - indexOfFirstDigit; + virtualIndexOfPoint = indexAfterDigits; + } else { + digitCount = indexAfterDigits - indexOfFirstDigit - 1; + exponent = Math.min(virtualIndexOfPoint - index + 1, MINIMAL_EIGHT_DIGIT_INTEGER) * 4; + } + + // Parse exponent number + // --------------------- + long exp_number = 0; + final boolean hasExponent = (ch == 'p') || (ch == 'P'); + if (hasExponent) { + ch = ++index < endIndex ? str[index] : 0; + boolean neg_exp = ch == '-'; + if (neg_exp || ch == '+') { + ch = ++index < endIndex ? str[index] : 0; + } + if (!isDigit(ch)) { + throw newNumberFormatException(str, off, endIndex - off); + } + do { + // Guard against overflow of exp_number + if (exp_number < MINIMAL_EIGHT_DIGIT_INTEGER) { + exp_number = 10 * exp_number + ch - '0'; + } + ch = ++index < endIndex ? str[index] : 0; + } while (isDigit(ch)); + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + + // Skip trailing whitespace + // ------------------------ + index = skipWhitespace(str, index, endIndex); + if (index < endIndex + || digitCount == 0 && str[virtualIndexOfPoint] != '.' + || !hasExponent) { + throw newNumberFormatException(str, off, endIndex - off); + } + + // Re-parse digits in case of a potential overflow + // ----------------------------------------------- + final boolean isDigitsTruncated; + int skipCountInTruncatedDigits = 0;// counts +1 if we skipped over the decimal point + if (digitCount > 16) { + digits = 0; + for (index = indexOfFirstDigit; index < indexAfterDigits; index++) { + ch = str[index]; + // Table look up is faster than a sequence of if-else-branches. + int hexValue = ch < 0 ? 
OTHER_CLASS : CHAR_TO_HEX_MAP[ch]; + if (hexValue >= 0) { + if (Long.compareUnsigned(digits, MINIMAL_NINETEEN_DIGIT_INTEGER) < 0) { + digits = (digits << 4) | hexValue; + } else { + break; + } + } else { + skipCountInTruncatedDigits++; + } + } + isDigitsTruncated = (index < indexAfterDigits); + } else { + isDigitsTruncated = false; + } + + double d = FastDoubleMath.hexFloatLiteralToDouble(index, isNegative, digits, exponent, virtualIndexOfPoint, + exp_number, isDigitsTruncated, skipCountInTruncatedDigits); + return Double.isNaN(d) ? Double.parseDouble(new String(str, off, endIndex - off)) : d; + } + + private static int skipWhitespace(byte[] str, int index, int endIndex) { + for (; index < endIndex; index++) { + if ((str[index] & 0xff) > 0x20) { + break; + } + } + return index; + } + +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParserFromCharArray.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParserFromCharArray.java new file mode 100644 index 00000000000..f71193fbfd9 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/external/FastDoubleParserFromCharArray.java @@ -0,0 +1,549 @@ +/* + * @(#)FastDoubleParser.java Copyright © 2021. Werner Randelshofer, Switzerland. MIT License. + */ + + +/* + * TEMPORARY Modifications by kosak: change package name FOR NOW (until we correctly include this jar in our build) + */ + + +package io.deephaven.csv.tokenization.external; + +/** + * This is a C++ to Java port of Daniel Lemire's fast_double_parser. + *

    + * <p>
    + * The code has been changed so that it parses the same syntax as {@link Double#parseDouble(String)}.
    + * <p>
    + * References:
    + * <ul>
    + * <li>Daniel Lemire, fast_double_parser, 4x faster than strtod. Apache License 2.0 or Boost Software License.
    + * (github.com)</li>
    + * <li>Daniel Lemire, fast_float number parsing library: 4x faster than strtod. Apache License 2.0.
    + * (github.com)</li>
    + * <li>Daniel Lemire, Number Parsing at a Gigabyte per Second, Software: Practice and Experience 51 (8), 2021.
    + * arXiv.2101.11408v3 [cs.DS] 24 Feb 2021 (arxiv.org)</li>
    + * </ul>
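    + * <p>
    + * A minimal usage sketch (illustrative only; it assumes nothing beyond the two public {@code parseDouble}
    + * overloads declared below):
    + * <pre>{@code
    + * char[] chars = "6.02214076e23".toCharArray();
    + * double d = FastDoubleParserFromCharArray.parseDouble(chars);        // whole array
    + * double e = FastDoubleParserFromCharArray.parseDouble(chars, 0, 4);  // just "6.02"
    + * }</pre>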
    + */ +public class FastDoubleParserFromCharArray { + private final static long MINIMAL_NINETEEN_DIGIT_INTEGER = 1000_00000_00000_00000L; + private final static int MINIMAL_EIGHT_DIGIT_INTEGER = 10_000_000; + /** + * Special value in {@link #CHAR_TO_HEX_MAP} for the decimal point character. + */ + private static final byte DECIMAL_POINT_CLASS = -4; + /** + * Special value in {@link #CHAR_TO_HEX_MAP} for characters that are neither a hex digit nor a decimal point + * character.. + */ + private static final byte OTHER_CLASS = -1; + /** + * A table of 128 entries or of entries up to including character 'p' would suffice. + *

    + * However for some reason, performance is best, if this table has exactly 256 entries. + */ + private static final byte[] CHAR_TO_HEX_MAP = new byte[256]; + + static { + for (char ch = 0; ch < CHAR_TO_HEX_MAP.length; ch++) { + CHAR_TO_HEX_MAP[ch] = OTHER_CLASS; + } + for (char ch = '0'; ch <= '9'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - '0'); + } + for (char ch = 'A'; ch <= 'F'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - 'A' + 10); + } + for (char ch = 'a'; ch <= 'f'; ch++) { + CHAR_TO_HEX_MAP[ch] = (byte) (ch - 'a' + 10); + } + for (char ch = '.'; ch <= '.'; ch++) { + CHAR_TO_HEX_MAP[ch] = DECIMAL_POINT_CLASS; + } + } + + /** + * Prevents instantiation. + */ + private FastDoubleParserFromCharArray() { + + } + + private static boolean isDigit(char c) { + return '0' <= c && c <= '9'; + } + + private static NumberFormatException newNumberFormatException(char[] str, int off, int len) { + if (len > 1024) { + // str can be up to Integer.MAX_VALUE characters long + return new NumberFormatException("For input string of length " + len); + } else { + return new NumberFormatException("For input string: \"" + new String(str, off, len) + "\""); + } + } + + /** + * Convenience method for calling {@link #parseDouble(char[], int, int)}. + * + * @param str the string to be parsed + * @return the parsed double value + * @throws NumberFormatException if the string can not be parsed + */ + public static double parseDouble(char[] str) throws NumberFormatException { + return parseDouble(str, 0, str.length); + } + + /** + * Returns a Double object holding the double value represented by the argument string {@code str}. + *

    + * <p>
    + * This method can be used as a drop-in replacement for {@link Double#valueOf(String)}. (Assuming that the API of
    + * that method has not changed since Java SE 16.)
    + * <p>
    + * Leading and trailing whitespace characters in {@code str} are ignored. Whitespace is removed as if by the
    + * {@link String#trim()} method; that is, characters in the range [U+0000,U+0020].
    + * <p>
    + * The rest of {@code str} should constitute a FloatValue as described by the lexical syntax rules shown below:
    + * <pre>
    + * FloatValue:
    + *   [Sign] {@code NaN}
    + *   [Sign] {@code Infinity}
    + *   [Sign] DecimalFloatingPointLiteral
    + *   [Sign] HexFloatingPointLiteral
    + *   SignedInteger
    + *
    + * HexFloatingPointLiteral:
    + *   HexSignificand BinaryExponent
    + *
    + * HexSignificand:
    + *   HexNumeral
    + *   HexNumeral {@code .}
    + *   {@code 0x} [HexDigits] {@code .} HexDigits
    + *   {@code 0X} [HexDigits] {@code .} HexDigits
    + *
    + * BinaryExponent:
    + *   BinaryExponentIndicator SignedInteger
    + *
    + * BinaryExponentIndicator:
    + *   {@code p}
    + *   {@code P}
    + *
    + * DecimalFloatingPointLiteral:
    + *   Digits {@code .} [Digits] [ExponentPart]
    + *   {@code .} Digits [ExponentPart]
    + *   Digits ExponentPart
    + *
    + * ExponentPart:
    + *   ExponentIndicator SignedInteger
    + *
    + * ExponentIndicator:
    + *   (one of)
    + *   e E
    + *
    + * SignedInteger:
    + *   [Sign] Digits
    + *
    + * Sign:
    + *   (one of)
    + *   + -
    + *
    + * Digits:
    + *   Digit {Digit}
    + *
    + * HexNumeral:
    + *   {@code 0} {@code x} HexDigits
    + *   {@code 0} {@code X} HexDigits
    + *
    + * HexDigits:
    + *   HexDigit {HexDigit}
    + *
    + * HexDigit:
    + *   (one of)
    + *   {@code 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F}
    + * </pre>
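    + * For example (an illustrative sketch of the FloatValue rule, not an exhaustive list):
    + * <pre>{@code
    + * // "NaN"        ->  [Sign] NaN
    + * // "-Infinity"  ->  [Sign] Infinity
    + * // "3.25e-2"    ->  [Sign] DecimalFloatingPointLiteral
    + * // "0x1p-2"     ->  [Sign] HexFloatingPointLiteral   (== 0.25)
    + * }</pre>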
    + * + * @param str the string to be parsed, a byte array with characters in ISO-8859-1, ASCII or UTF-8 encoding + * @param off The index of the first character to parse + * @param len The number of characters to parse + * @return the parsed double value + * @throws NumberFormatException if the string can not be parsed + */ + public static double parseDouble(char[] str, int off, int len) throws NumberFormatException { + final int endIndex = len + off; + + // Skip leading whitespace + // ------------------- + int index = skipWhitespace(str, off, endIndex); + if (index == endIndex) { + throw new NumberFormatException("empty String"); + } + char ch = str[index]; + + // Parse optional sign + // ------------------- + final boolean isNegative = ch == '-'; + if (isNegative || ch == '+') { + ch = ++index < endIndex ? str[index] : 0; + if (ch == 0) { + throw newNumberFormatException(str, off, len); + } + } + + // Parse NaN or Infinity + // --------------------- + if (ch == 'N') { + return parseNaN(str, index, endIndex, off); + } else if (ch == 'I') { + return parseInfinity(str, index, endIndex, isNegative, off); + } + + // Parse optional leading zero + // --------------------------- + final boolean hasLeadingZero = ch == '0'; + if (hasLeadingZero) { + ch = ++index < endIndex ? str[index] : 0; + if (ch == 'x' || ch == 'X') { + return parseRestOfHexFloatingPointLiteral(str, index + 1, endIndex, isNegative, off); + } + } + + return parseRestOfDecimalFloatLiteral(str, endIndex, index, isNegative, hasLeadingZero, off); + } + + private static double parseInfinity(char[] str, int index, int endIndex, boolean negative, int off) { + if (index + 7 < endIndex + // && str.charAt(index) == 'I' + && str[index + 1] == 'n' + && str[index + 2] == 'f' + && str[index + 3] == 'i' + && str[index + 4] == 'n' + && str[index + 5] == 'i' + && str[index + 6] == 't' + && str[index + 7] == 'y') { + index = skipWhitespace(str, index + 8, endIndex); + if (index < endIndex) { + throw newNumberFormatException(str, off, endIndex - off); + } + return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + } else { + throw newNumberFormatException(str, off, endIndex - off); + } + } + + private static double parseNaN(char[] str, int index, int endIndex, int off) { + if (index + 2 < endIndex + // && str.charAt(index) == 'N' + && str[index + 1] == 'a' + && str[index + 2] == 'N') { + + index = skipWhitespace(str, index + 3, endIndex); + if (index < endIndex) { + throw newNumberFormatException(str, off, endIndex - off); + } + + return Double.NaN; + } else { + throw newNumberFormatException(str, off, endIndex - off); + } + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble}): + *
    + * <pre>
    + * RestOfDecimalFloatingPointLiteral:
    + *   [Digits] {@code .} [Digits] [ExponentPart]
    + *   {@code .} Digits [ExponentPart]
    + *   [Digits] ExponentPart
    + * </pre>
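    + * For example (an illustrative sketch; the optional sign was already consumed by {@link #parseDouble}):
    + * <pre>{@code
    + * // "0.25"   matches  [Digits] . [Digits]
    + * // "25e-2"  matches  [Digits] ExponentPart
    + * }</pre>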
    + * + * @param str the input string + * @param endIndex the end index of the string + * @param index index to the first character of RestOfHexFloatingPointLiteral + * @param isNegative if the resulting number is negative + * @param hasLeadingZero if the digit '0' has been consumed + * @return a double representation + */ + private static double parseRestOfDecimalFloatLiteral(char[] str, int endIndex, int index, boolean isNegative, + boolean hasLeadingZero, int off) { + // Parse digits + // ------------ + // Note: a multiplication by a constant is cheaper than an + // arbitrary integer multiplication. + long digits = 0;// digits is treated as an unsigned long + int exponent = 0; + final int indexOfFirstDigit = index; + int virtualIndexOfPoint = -1; + final int digitCount; + char ch = 0; + for (; index < endIndex; index++) { + ch = str[index]; + if (isDigit(ch)) { + // This might overflow, we deal with it later. + digits = 10 * digits + ch - '0'; + } else if (ch == '.') { + if (virtualIndexOfPoint != -1) { + throw newNumberFormatException(str, off, endIndex - off); + } + virtualIndexOfPoint = index; + } else { + break; + } + } + final int indexAfterDigits = index; + if (virtualIndexOfPoint == -1) { + digitCount = indexAfterDigits - indexOfFirstDigit; + virtualIndexOfPoint = indexAfterDigits; + } else { + digitCount = indexAfterDigits - indexOfFirstDigit - 1; + exponent = virtualIndexOfPoint - index + 1; + } + + // Parse exponent number + // --------------------- + long exp_number = 0; + final boolean hasExponent = (ch == 'e') || (ch == 'E'); + if (hasExponent) { + ch = ++index < endIndex ? str[index] : 0; + boolean neg_exp = ch == '-'; + if (neg_exp || ch == '+') { + ch = ++index < endIndex ? str[index] : 0; + } + if (!isDigit(ch)) { + throw newNumberFormatException(str, off, endIndex - off); + } + do { + // Guard against overflow of exp_number + if (exp_number < MINIMAL_EIGHT_DIGIT_INTEGER) { + exp_number = 10 * exp_number + ch - '0'; + } + ch = ++index < endIndex ? str[index] : 0; + } while (isDigit(ch)); + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + + // Skip trailing whitespace + // ------------------------ + index = skipWhitespace(str, index, endIndex); + if (index < endIndex + || !hasLeadingZero && digitCount == 0 && str[virtualIndexOfPoint] != '.') { + throw newNumberFormatException(str, off, endIndex - off); + } + + // Re-parse digits in case of a potential overflow + // ----------------------------------------------- + final boolean isDigitsTruncated; + int skipCountInTruncatedDigits = 0;// counts +1 if we skipped over the decimal point + if (digitCount > 19) { + digits = 0; + for (index = indexOfFirstDigit; index < indexAfterDigits; index++) { + ch = str[index]; + if (ch == '.') { + skipCountInTruncatedDigits++; + } else { + if (Long.compareUnsigned(digits, MINIMAL_NINETEEN_DIGIT_INTEGER) < 0) { + digits = 10 * digits + ch - '0'; + } else { + break; + } + } + } + isDigitsTruncated = index < indexAfterDigits; + } else { + isDigitsTruncated = false; + } + + double result = FastDoubleMath.decFloatLiteralToDouble(index, isNegative, digits, exponent, virtualIndexOfPoint, + exp_number, isDigitsTruncated, skipCountInTruncatedDigits); + return Double.isNaN(result) ? parseRestOfDecimalFloatLiteralTheHardWay(str, off, endIndex - off) : result; + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble}): + *
    + * <pre>
    + * RestOfDecimalFloatingPointLiteral:
    + *   [Digits] {@code .} [Digits] [ExponentPart]
    + *   {@code .} Digits [ExponentPart]
    + *   [Digits] ExponentPart
    + * </pre>
    + * + * @param str the input string + */ + private static double parseRestOfDecimalFloatLiteralTheHardWay(char[] str, int off, int len) { + return Double.parseDouble(new String(str, off, len)); + } + + /** + * Parses the following rules (more rules are defined in {@link #parseDouble}): + *
    + * <pre>
    + * RestOfHexFloatingPointLiteral:
    + *   RestOfHexSignificand BinaryExponent
    + *
    + * RestOfHexSignificand:
    + *   HexDigits
    + *   HexDigits {@code .}
    + *   [HexDigits] {@code .} HexDigits
    + * </pre>
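    + * For example (an illustrative sketch, assuming the {@code 0x}/{@code 0X} prefix was already consumed):
    + * <pre>{@code
    + * // "ff.8p0"  ->  0xff.8p0  ==  255.5 * 2^0  ==  255.5
    + * }</pre>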
    + * + * @param str the input string + * @param index index to the first character of RestOfHexFloatingPointLiteral + * @param endIndex the end index of the string + * @param isNegative if the resulting number is negative + * @param off offset from the start where character of interest start + * @return a double representation + */ + private static double parseRestOfHexFloatingPointLiteral( + char[] str, int index, int endIndex, boolean isNegative, int off) { + if (index >= endIndex) { + throw newNumberFormatException(str, off, endIndex - off); + } + + // Parse digits + // ------------ + long digits = 0;// digits is treated as an unsigned long + int exponent = 0; + final int indexOfFirstDigit = index; + int virtualIndexOfPoint = -1; + final int digitCount; + char ch = 0; + for (; index < endIndex; index++) { + ch = str[index]; + // Table look up is faster than a sequence of if-else-branches. + int hexValue = ch > 127 ? OTHER_CLASS : CHAR_TO_HEX_MAP[ch]; + if (hexValue >= 0) { + digits = (digits << 4) | hexValue;// This might overflow, we deal with it later. + } else if (hexValue == DECIMAL_POINT_CLASS) { + if (virtualIndexOfPoint != -1) { + throw newNumberFormatException(str, off, endIndex - off); + } + virtualIndexOfPoint = index; + } else { + break; + } + } + final int indexAfterDigits = index; + if (virtualIndexOfPoint == -1) { + digitCount = indexAfterDigits - indexOfFirstDigit; + virtualIndexOfPoint = indexAfterDigits; + } else { + digitCount = indexAfterDigits - indexOfFirstDigit - 1; + exponent = Math.min(virtualIndexOfPoint - index + 1, MINIMAL_EIGHT_DIGIT_INTEGER) * 4; + } + + // Parse exponent number + // --------------------- + long exp_number = 0; + final boolean hasExponent = (ch == 'p') || (ch == 'P'); + if (hasExponent) { + ch = ++index < endIndex ? str[index] : 0; + boolean neg_exp = ch == '-'; + if (neg_exp || ch == '+') { + ch = ++index < endIndex ? str[index] : 0; + } + if (!isDigit(ch)) { + throw newNumberFormatException(str, off, endIndex - off); + } + do { + // Guard against overflow of exp_number + if (exp_number < MINIMAL_EIGHT_DIGIT_INTEGER) { + exp_number = 10 * exp_number + ch - '0'; + } + ch = ++index < endIndex ? str[index] : 0; + } while (isDigit(ch)); + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + + // Skip trailing whitespace + // ------------------------ + index = skipWhitespace(str, index, endIndex); + if (index < endIndex + || digitCount == 0 && str[virtualIndexOfPoint] != '.' + || !hasExponent) { + throw newNumberFormatException(str, off, endIndex - off); + } + + // Re-parse digits in case of a potential overflow + // ----------------------------------------------- + final boolean isDigitsTruncated; + int skipCountInTruncatedDigits = 0;// counts +1 if we skipped over the decimal point + if (digitCount > 16) { + digits = 0; + for (index = indexOfFirstDigit; index < indexAfterDigits; index++) { + ch = str[index]; + // Table look up is faster than a sequence of if-else-branches. + int hexValue = ch > 127 ? 
OTHER_CLASS : CHAR_TO_HEX_MAP[ch]; + if (hexValue >= 0) { + if (Long.compareUnsigned(digits, MINIMAL_NINETEEN_DIGIT_INTEGER) < 0) { + digits = (digits << 4) | hexValue; + } else { + break; + } + } else { + skipCountInTruncatedDigits++; + } + } + isDigitsTruncated = (index < indexAfterDigits); + } else { + isDigitsTruncated = false; + } + + double d = FastDoubleMath.hexFloatLiteralToDouble(index, isNegative, digits, exponent, virtualIndexOfPoint, + exp_number, isDigitsTruncated, skipCountInTruncatedDigits); + return Double.isNaN(d) ? Double.parseDouble(new String(str, off, endIndex - off)) : d; + } + + private static int skipWhitespace(char[] str, int index, int endIndex) { + for (; index < endIndex; index++) { + if (str[index] > ' ') { + break; + } + } + return index; + } + +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java b/extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java new file mode 100644 index 00000000000..bc1264ddafc --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java @@ -0,0 +1,11 @@ +package io.deephaven.csv.util; + +public class CsvReaderException extends Exception { + public CsvReaderException(String message) { + super(message); + } + + public CsvReaderException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java b/extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java new file mode 100644 index 00000000000..bf029620187 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java @@ -0,0 +1,28 @@ +package io.deephaven.csv.util; + +import java.util.function.Function; + +public class Renderer { + public static String renderList(Iterable items) { + return renderList(items, ", ", Object::toString); + } + + public static String renderList(Iterable items, String separator) { + return renderList(items, separator, Object::toString); + } + + public static String renderList(Iterable items, final String separator, Function renderer) { + return renderList(new StringBuilder(), items, separator, renderer).toString(); + } + + public static StringBuilder renderList(StringBuilder sb, Iterable items, final String separator, + Function renderer) { + String separatorToUse = ""; + for (T item : items) { + sb.append(separatorToUse); + sb.append(renderer.apply(item)); + separatorToUse = separator; + } + return sb; + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/util/TimeLogger.java b/extensions/csv/src/main/java/io/deephaven/csv/util/TimeLogger.java new file mode 100644 index 00000000000..115742c8754 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/util/TimeLogger.java @@ -0,0 +1,28 @@ +package io.deephaven.csv.util; + +import java.io.IOException; +import java.time.LocalTime; +import java.time.temporal.ChronoUnit; + +public class TimeLogger implements AutoCloseable { + private final String message; + private final LocalTime start; + + public TimeLogger(String message) { + this.message = message; + start = LocalTime.now(); + System.out.println(start + ": " + message + ": starting"); + } + + public void note(String note) { + final LocalTime current = LocalTime.now(); + final long elapsed = start.until(current, ChronoUnit.MILLIS); + System.out.printf("%s: %s: %s. 
Elapsed time is %g seconds\n", + current, message, note, elapsed / 1000.0); + } + + @Override + public void close() { + note("Finished"); + } +} diff --git a/extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java b/extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java new file mode 100644 index 00000000000..e367647524a --- /dev/null +++ b/extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java @@ -0,0 +1,1704 @@ +package io.deephaven.csv; + +import gnu.trove.list.array.*; +import io.deephaven.csv.containers.ByteSlice; +import io.deephaven.csv.parsers.IteratorHolder; +import io.deephaven.csv.parsers.Parser; +import io.deephaven.csv.parsers.Parsers; +import io.deephaven.csv.parsers.context.ParseContext; +import io.deephaven.csv.reading.CsvReader; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.sinks.SinkFactory; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; +import io.deephaven.csv.util.Renderer; +import org.assertj.core.api.Assertions; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.io.*; +import java.lang.reflect.Array; +import java.math.BigDecimal; +import java.time.ZoneOffset; +import java.util.ArrayList; +import java.util.List; +import java.util.function.*; + +@Category(CsvTestCategory.class) +public class CsvReaderTest { + private static class Sentinels { + public static final byte NULL_BOOLEAN_AS_BYTE = Byte.MIN_VALUE; + public static final byte NULL_BYTE = Byte.MIN_VALUE; + public static final short NULL_SHORT = Short.MIN_VALUE; + public static final int NULL_INT = Integer.MIN_VALUE; + public static final long NULL_LONG = Long.MIN_VALUE; + public static final float NULL_FLOAT = -Float.MAX_VALUE; + public static final double NULL_DOUBLE = -Double.MAX_VALUE; + public static final char NULL_CHAR = Character.MAX_VALUE; + public static final String NULL_STRING = null; + public static final long NULL_DATETIME_AS_LONG = Long.MIN_VALUE; + public static final long NULL_TIMESTAMP_AS_LONG = Long.MIN_VALUE; + } + + private static final String BOOLEAN_INPUT = "" + + "Values\n" + + "true\n" + + "\n" + + "false\n" + + "True\n" + + "False\n" + + "TrUe\n" + + "FALSE\n"; + + @Test + public void booleans() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (byte) 1, Sentinels.NULL_BOOLEAN_AS_BYTE, (byte) 0, (byte) 1, (byte) 0, + (byte) 1, (byte) 0)); + + invokeTest(csvReaderWithNulls(), BOOLEAN_INPUT, expected); + } + + @Test(expected = CsvReaderException.class) + public void unconfiguredNullBoolean() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (byte) 1, Sentinels.NULL_BOOLEAN_AS_BYTE, (byte) 0, (byte) 1, (byte) 0, + (byte) 1, (byte) 0)); + + invokeTest(defaultCsvReader(), BOOLEAN_INPUT, expected); + } + + private static final String CHAR_INPUT = "" + + "Values\n" + + "A\n" + + "\n" + + "B\n" + + "C\n" + + "1\n" + + "2\n" + + "3\n"; + + @Test + public void chars() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 'A', Sentinels.NULL_CHAR, 'B', 'C', '1', '2', '3')); + + invokeTest(csvReaderWithNulls(), CHAR_INPUT, expected); + } + + @Test(expected = CsvReaderException.class) + public void unconfiguredNullChar() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 'A', Sentinels.NULL_CHAR, 'B', 'C', '1', '2', '3')); + + invokeTest(defaultCsvReader(), CHAR_INPUT, expected); 
+ } + + @Test + public void forbiddenNullChars() throws CsvReaderException { + final String input = "" + + "Values\n" + + "A\n" + + Sentinels.NULL_CHAR + "\n"; + + // NULL_CHAR can't be parsed as char; will be promoted to String. + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "A", "" + Sentinels.NULL_CHAR)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + private static final String BYTE_INPUT = "" + + "Values\n" + + "-127\n" + + "\n" + + "127\n"; + + @Test + public void byteViaInference() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (byte) (Byte.MIN_VALUE + 1), Sentinels.NULL_BYTE, Byte.MAX_VALUE)); + + invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE), BYTE_INPUT, expected); + } + + @Test(expected = CsvReaderException.class) + public void unconfiguredNullByte() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (byte) (Byte.MIN_VALUE + 1), Sentinels.NULL_BYTE, Byte.MAX_VALUE)); + + invokeTest(defaultCsvReader(), BYTE_INPUT, expected); + } + + @Test + public void forbiddenNullBytes() throws CsvReaderException { + final String input = "" + + "Values\n" + + "-127\n" + + Sentinels.NULL_BYTE + "\n" + + "127\n"; + // NULL_BYTE can't be parsed as char; will be promoted to short (because we're using + // the Parsers.COMPLETE set of parsers, and short is in Parsers.COMPLETE set). + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (short) (Byte.MIN_VALUE + 1), Sentinels.NULL_BYTE, Byte.MAX_VALUE)); + + invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE), input, expected); + } + + @Test + public void byteIsInt() throws CsvReaderException { + // By default, byte will be parsed as int, because neither Parsers.BYTE nor Parsers.SHORT is in Parsers.DEFAULT + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (Byte.MIN_VALUE + 1), Sentinels.NULL_INT, Byte.MAX_VALUE)); + + invokeTest(csvReaderWithNulls(), BYTE_INPUT, expected); + } + + private static final String SHORT_INPUT = "" + + "Values\n" + + "-32767\n" + + "\n" + + "32767\n"; + + @Test + public void shorts() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (short) (Short.MIN_VALUE + 1), Sentinels.NULL_SHORT, Short.MAX_VALUE)); + + invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE), SHORT_INPUT, expected); + } + + @Test(expected = CsvReaderException.class) + public void unconfiguredNullShort() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (short) (Short.MIN_VALUE + 1), Sentinels.NULL_SHORT, Short.MAX_VALUE)); + + invokeTest(defaultCsvReader(), SHORT_INPUT, expected); + } + + @Test + public void forbiddenNullShorts() throws CsvReaderException { + final String input = "" + + "Values\n" + + "-32767\n" + + Sentinels.NULL_SHORT + "\n" + + "32767\n"; + + // NULL_SHORT can't be parsed as short; will be promoted to int. 
+ final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (int) (Short.MIN_VALUE + 1), Sentinels.NULL_SHORT, Short.MAX_VALUE)); + + invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE), input, expected); + } + + @Test + public void ints() throws CsvReaderException { + final String input = "" + + "Values\n" + + "-2147483647\n" + + "\n" + + "2147483647\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", Integer.MIN_VALUE + 1, Sentinels.NULL_INT, Integer.MAX_VALUE)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test + public void forbiddenNullInts() throws CsvReaderException { + final String input = "" + + "Values\n" + + Sentinels.NULL_INT + "\n"; + + // NULL_INT can't be parsed as int; will be promoted to long. + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (long) Sentinels.NULL_INT)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + private static final String LONG_INPUT = "" + + "Values\n" + + "-9223372036854775807\n" + + "\n" + + "9223372036854775807\n"; + + @Test + public void longs() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", Long.MIN_VALUE + 1, Sentinels.NULL_LONG, Long.MAX_VALUE)); + + invokeTest(csvReaderWithNulls(), LONG_INPUT, expected); + } + + @Test + public void forbiddenNullLongs() throws CsvReaderException { + final String input = "" + + "Values\n" + + Sentinels.NULL_LONG + "\n"; + + // NULL_LONG can't be parsed as long; will be promoted to double. + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", (double) Sentinels.NULL_LONG)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test + public void longAsStringsViaInference() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "-9223372036854775807", null, "9223372036854775807")); + + invokeTest(csvReaderWithNulls().setParsers(List.of(Parsers.STRING)), LONG_INPUT, expected); + } + + @Test + public void longAsStringsViaParser() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "-9223372036854775807", null, "9223372036854775807")); + + invokeTest(csvReaderWithNulls().setParserFor("Values", Parsers.STRING), LONG_INPUT, expected); + } + + private static final String FLOAT_INPUT = "" + + "Values\n" + + "Infinity\n" + + "\n" + + "-Infinity\n" + + "NaN\n" + + "3.4028234e+38\n" + + "1.17549435E-38\n" + + "1.4e-45\n"; + + @Test + public void floatIsDouble() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", + Float.POSITIVE_INFINITY, + Sentinels.NULL_DOUBLE, + Float.NEGATIVE_INFINITY, + Float.NaN, + 3.4028234e+38d, + 1.17549435E-38d, + 1.4e-45d)); + + invokeTest(csvReaderWithNulls(), FLOAT_INPUT, expected); + } + + @Test + public void floatViaInference() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", + Float.POSITIVE_INFINITY, + Sentinels.NULL_FLOAT, + Float.NEGATIVE_INFINITY, + Float.NaN, + Float.MAX_VALUE, + Float.MIN_NORMAL, + Float.MIN_VALUE)); + + invokeTest(csvReaderWithNulls().setParsers(List.of(Parsers.FLOAT)), FLOAT_INPUT, expected); + } + + @Test + public void forbiddenNullFloats() throws CsvReaderException { + final String input = "" + + "Values\n" + + Sentinels.NULL_FLOAT + "\n"; + + // I wanted to say simply (double)Sentinels.NULL_FLOAT, but that's a different number from + // the below (alas). 
+ final double nullFloatAsParsedByDouble = Double.parseDouble("" + Sentinels.NULL_FLOAT); + + // NULL_FLOAT can't be parsed as float; will be promoted to double. + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", nullFloatAsParsedByDouble)); + + invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE), input, expected); + } + + @Test + public void significantFiguresChooseFloat() throws CsvReaderException { + final String input = "" + + "Values\n" + + "123.4567\n" + + "234.5678\n" + + "345.6789e10\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", + 123.4567f, + 234.5678f, + 345.6789e10f)); + + invokeTest(csvReaderWithNulls().addParsers(Parsers.FLOAT, Parsers.DOUBLE), input, expected); + } + + @Test + public void significantFiguresChooseDouble() throws CsvReaderException { + final String input = "" + + "Values\n" + + "9123.4567\n" + + "9234.5678\n" + + "9345.6789e10\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", + 9123.4567d, + 9234.5678d, + 9345.6789e10d)); + + invokeTest(csvReaderWithNulls().addParsers(Parsers.FLOAT, Parsers.DOUBLE), input, expected); + } + + @Test + public void doubleRange() throws CsvReaderException { + final String input = "" + + "Values\n" + + "Infinity\n" + + "\n" + + "-Infinity\n" + + "NaN\n" + + "1.7976931348623157e+308\n" + + "2.2250738585072014E-308\n" + + "4.9e-324\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", + Double.POSITIVE_INFINITY, + Sentinels.NULL_DOUBLE, + Double.NEGATIVE_INFINITY, + Double.NaN, + Double.MAX_VALUE, + Double.MIN_NORMAL, + Double.MIN_VALUE)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test + public void forbiddenNullDoubles() throws CsvReaderException { + final String input = "" + + "Values\n" + + Sentinels.NULL_DOUBLE + "\n"; + + // NULL_DOUBLE can't be parsed as double; will be promoted to String + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", Sentinels.NULL_DOUBLE + "")); + + invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE), input, expected); + } + + @Test + public void strings() throws CsvReaderException { + final String input = "" + + "Values\n" + + "\"Hello, world\"\n" + + "\n" + // the empty string is null + "Goodbye.\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", + "Hello, world", + null, + "Goodbye.")); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test + public void quotingSuccessfulEdgeCases() throws CsvReaderException { + final String input = "" + + "Values\n" + + "##\n" + // the empty string, which is configured below to give us NULL + "####\n" + // # + "######\n"; // ## + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", + null, + "#", + "##")); + + invokeTest(csvReaderWithNulls().setquoteChar('#'), input, expected); + } + + @Test(expected = CsvReaderException.class) + public void quotingFailingEdgeCases() throws CsvReaderException { + final String input = "" + + "Values\n" + + "###\n"; // invalid + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("dummy")); + + invokeTest(csvReaderWithNulls().setquoteChar('#'), input, expected); + } + + @Test(expected = CsvReaderException.class) + public void quotingExcessMaterial() throws CsvReaderException { + final String input = "" + + "Val1,Val2\n" + + "#hello#junk,there\n"; // invalid + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("dummy")); + + invokeTest(csvReaderWithNulls().setquoteChar('#'), input, expected); + } + + @Test(expected 
= CsvReaderException.class) + public void stringWithNullLiteralSetButValueUnset() throws CsvReaderException { + // It should fail when the null literal is set to something special, but the null String value is not set. + final String input = "" + + "Values\n" + + "hello\n" + + "NULL\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "hello", null)); + + invokeTest(new CsvReader() + .setNullValueLiteral("NULL"), input, expected); + } + + @Test + public void stringWithNullLiteralSetAndValueNull() throws CsvReaderException { + // It should work when the null literal is set to something special, but the null String value is the null + // reference. + final String input = "" + + "Values\n" + + "hello\n" + + "NULL\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "hello", null)); + + invokeTest(new CsvReader() + .setNullValueLiteral("NULL") + .setNullStringValue(null), input, expected); + } + + @Test + public void stringWithNullLiteralSetAndValueSet() throws CsvReaderException { + // It should work when the null literal is set to something special, and the null String value is set to the + // same + // thing. + final String input = "" + + "Values\n" + + "hello\n" + + "NULL\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "hello", "NULL")); + + invokeTest(new CsvReader() + .setNullValueLiteral("NULL") + .setNullStringValue("NULL"), input, expected); + } + + @Test + public void stringWithNullLiteralSetAndValueSetDifferently() throws CsvReaderException { + // It should work when the null literal is set to something special, and the null String value is set to + // something + // different. + final String input = "" + + "Values\n" + + "hello\n" + + "NULL\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "hello", "@@internal-NULL@@")); + + invokeTest(new CsvReader() + .setNullValueLiteral("NULL") + .setNullStringValue("@@internal-NULL@@"), input, expected); + } + + @Test + public void unconfiguredNullStringsAreEmpty() throws CsvReaderException { + // As a very special case, in order to be nice to our users, if: + // 1. the null literal is configured to be the empty string (which it is by default), + // 2. and the input contains the empty string (which our logic would normally interpret as the 'null' of the + // target type) + // 3. and the column is being parsed as String, either explicitly or due to inference, + // 4. but the user has not configured a null sentinel value for String + // + // ...then we will parse that value as an empty string rather than throwing an exception, as we would do + // for any other data type. 
+ final String input = "" + + "Values\n" + + "\"Hello, world\"\n" + + "\n" + + "Goodbye.\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", + "Hello, world", + "", + "Goodbye.")); + + invokeTest(new CsvReader(), input, expected); + } + + @Test + public void stringsPound() throws CsvReaderException { + final String input = "" + + "Values\n" + + "#Hello, world#\n" + + "\n" + + "Goodbye.\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", + "Hello, world", + null, + "Goodbye.")); + + invokeTest(csvReaderWithNulls().setquoteChar('#'), input, expected); + } + + + @Test + public void newlineDiversity() throws CsvReaderException { + final String input = "" + + "Values\r" + + "-2147483647\r\n" + + "\n" + + "2147483647\r\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", Integer.MIN_VALUE + 1, Sentinels.NULL_INT, Integer.MAX_VALUE)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + private static final String LANGUAGE_EXAMPLE_HEADERLESS_INPUT = "" + + "C,Dennis Ritchie,Compiled\n" + + "C++,Bjarne Stroustrup,Compiled\n" + + "Fortran,John Backus,Compiled\n" + + "Java,James Gosling,Both\n" + + "JavaScript,Brendan Eich,Interpreted\n" + + "MATLAB,Cleve Moler,Interpreted\n" + + "Pascal,Niklaus Wirth,Compiled\n" + + "Python,Guido van Rossum,Interpreted\n"; + + private static final String LANGUAGE_EXAMPLE_INPUT = "" + + "Language,Creator,Type\n" + + LANGUAGE_EXAMPLE_HEADERLESS_INPUT; + + private static final String LANGUAGE_EXAMPLE_TSV = "" + + "Language\tCreator\tType\n" + + "C\tDennis Ritchie\tCompiled\n" + + "C++\tBjarne Stroustrup\tCompiled\n" + + "Fortran\tJohn Backus\tCompiled\n" + + "Java\tJames Gosling\tBoth\n" + + "JavaScript\tBrendan Eich\tInterpreted\n" + + "MATLAB\tCleve Moler\tInterpreted\n" + + "Pascal\tNiklaus Wirth\tCompiled\n" + + "Python\tGuido van Rossum\tInterpreted\n"; + + @Test + public void languageExample() throws CsvReaderException { + invokeTest(csvReaderWithNulls(), LANGUAGE_EXAMPLE_INPUT, languageCreatorTypeTable()); + } + + @Test + public void languageExampleTsv() throws CsvReaderException { + invokeTest(csvReaderWithNulls().setFieldDelimiter('\t'), LANGUAGE_EXAMPLE_TSV, languageCreatorTypeTable()); + } + + @Test + public void languageExampleHeaderless() throws CsvReaderException { + invokeTest(csvReaderWithNulls().setHasHeaders(false), LANGUAGE_EXAMPLE_HEADERLESS_INPUT, + languageCreatorTypeTableHeaderless()); + } + + @Test + public void languageExampleHeaderlessExplicit() throws CsvReaderException { + final ColumnSet expected = languageCreatorTypeTable(); + invokeTest(csvReaderWithNulls() + .setHasHeaders(false) + .setHeaders(List.of("Language", "Creator", "Type")), + LANGUAGE_EXAMPLE_HEADERLESS_INPUT, expected); + } + + private static ColumnSet languageCreatorTypeTable() { + return populateLanguageExample("Language", "Creator", "Type"); + } + + private static ColumnSet languageCreatorTypeTableHeaderless() { + return populateLanguageExample("Column1", "Column2", "Column3"); + } + + private static ColumnSet populateLanguageExample(final String col1, final String col2, final String col3) { + return ColumnSet.of( + Column.ofRefs(col1, "C", "C++", "Fortran", "Java", + "JavaScript", "MATLAB", "Pascal", "Python"), + Column.ofRefs(col2, "Dennis Ritchie", "Bjarne Stroustrup", "John Backus", "James Gosling", + "Brendan Eich", "Cleve Moler", "Niklaus Wirth", "Guido van Rossum"), + Column.ofRefs(col3, "Compiled", "Compiled", "Compiled", "Both", + "Interpreted", "Interpreted", "Compiled", 
"Interpreted")); + } + + private static final String WHITESPACE_NO_QUOTES = "" + + "Sym,Type,Price,SecurityId\n" + + "GOOG, Dividend, 0.25, 200\n" + + "T, Dividend, 0.15, 300\n" + + " Z, Dividend, 0.18, 500\n"; + + @Test + public void whitespaceNoQuotes() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + invokeTest(csvReaderWithNulls(), WHITESPACE_NO_QUOTES, expected); + } + + @Test + public void whitespaceNoQuotesLiteral() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", " Z"), + Column.ofRefs("Type", " Dividend", " Dividend", " Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + invokeTest(csvReaderWithNulls().setIgnoreSurroundingSpaces(false), WHITESPACE_NO_QUOTES, expected); + } + + @Test + public void whitespaceOutside() throws CsvReaderException { + // Use vertical bars instead of quotation marks to make things more readable for the humans looking at this. + final String input = ("" + + "Sym,Type,Price,SecurityId\n" + + "|GOOG|, |Dividend|, |0.25|, |200|\n" + + "|T|, |Dividend|, |0.15|, |300|\n" + + " |Z|, |Dividend|, |0.18|, |500|\n"); + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + invokeTest(csvReaderWithNulls().setquoteChar('|'), input, expected); + } + + // Use vertical bars instead of quotation marks to make things more readable for the humans looking at this. 
+ private static final String WHITESPACE_INSIDE = "" + + "Sym,Type,Price,SecurityId\n" + + "|GOOG|,| Dividend|,| 0.25|,| 200|\n" + + "|T|,|Dividend |,| 0.15|,| 300|\n" + + "| Z|,| Dividend |,| 0.18|,| 500|\n"; + + @Test + public void whitespaceInsideDefault() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", " Z"), + Column.ofRefs("Type", " Dividend", "Dividend ", " Dividend "), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + invokeTest(csvReaderWithNulls().setquoteChar('|'), WHITESPACE_INSIDE, expected); + } + + @Test + public void whitespaceInsideTrim() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + invokeTest(csvReaderWithNulls().setquoteChar('|').setTrim(true), WHITESPACE_INSIDE, expected); + } + + private static final String WHITESPACE_INSIDE_AND_OUTSIDE = "" + + "Sym,Type,Price,SecurityId\n" + + "|GOOG|, | Dividend|, | 0.25|, | 200|\n" + + "|T|, | Dividend|, | 0.15|, | 300|\n" + + "| Z|, | Dividend|, | 0.18|, | 500|\n"; + + @Test + public void whitespaceInsideAndOutsideDefault() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", " Z"), + Column.ofRefs("Type", " Dividend", " Dividend", " Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + invokeTest(csvReaderWithNulls().setquoteChar('|'), WHITESPACE_INSIDE_AND_OUTSIDE, expected); + } + + @Test + public void whitespaceInsideAndOutsideTrim() throws CsvReaderException { + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + invokeTest(csvReaderWithNulls().setquoteChar('|').setTrim(true), WHITESPACE_INSIDE_AND_OUTSIDE, expected); + } + + @Test + public void noTrailingNewline() throws CsvReaderException { + // Sometimes there is no trailing newline. That's OK. + final String input = "" + + "SomeInts,SomeStrings\n" + + "-3,foo\n" + + "4,bar\n" + + "-5,baz"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("SomeInts", -3, 4, -5), + Column.ofRefs("SomeStrings", "foo", "bar", "baz")); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test(expected = CsvReaderException.class) + public void tooFewColumnsWithFinalNewline() throws CsvReaderException { + // Too few columns is an error. + final String input = "" + + "SomeInts,SomeStrings\n" + + "-3,foo\n" + + "4,bar\n" + + "-5\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("SomeInts", -3, 4, -5), + Column.ofRefs("SomeStrings", "foo", "bar", "baz")); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test(expected = CsvReaderException.class) + public void tooFewColumnsWithoutFinalNewline() throws CsvReaderException { + // Too few columns is an error. 
+ final String input = "" + + "SomeInts,SomeStrings\n" + + "-3,foo\n" + + "4,bar\n" + + "-5"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("SomeInts", -3, 4, -5), + Column.ofRefs("SomeStrings", "foo", "bar", "baz")); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test(expected = CsvReaderException.class) + public void tooManyColumns() throws CsvReaderException { + // Too many columns is an error. + final String input = "" + + "SomeInts,SomeStrings\n" + + "-3,foo\n" + + "4,bar,quz\n" + + "-5,baz\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("SomeInts", -3, 4, -5), + Column.ofRefs("SomeStrings", "foo", "bar", "baz")); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test + public void dateTimes() throws CsvReaderException { + final String input = "" + + "Values\n" + + "2021-09-27T19:00:00Z\n" + + "\n" + + "2021-09-27T20:00:00Z\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_LONG, 1632772800000000000L)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + @Test + public void dateTimeFormats() throws CsvReaderException { + final String input = "" + + "Values\n" + + "20210927T19Z\n" + + "20210927 19Z\n" + + "20210927T1934Z\n" + + "20210927T193458Z\n" + + "20210927T193458.123Z\n" + + "20210927T193458.123456Z\n" + + "20210927T193458.123456789Z\n" + + "20210927T193458.123456789+0200\n" + + "20210927T193458.123456789-0330\n" + + + "2021-09-27T19Z\n" + + "2021-09-27 19Z\n" + + "2021-09-27T19:34Z\n" + + "2021-09-27T19:34:58Z\n" + + "2021-09-27T19:34:58.123Z\n" + + "2021-09-27T19:34:58.123456Z\n" + + "2021-09-27T19:34:58.123456789Z\n" + + "2021-09-27T19:34:58.123456789+0200\n" + + "2021-09-27T19:34:58.123456789-0330\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", + 1632769200000000000L, + 1632769200000000000L, + 1632771240000000000L, + 1632771298000000000L, + 1632771298123000000L, + 1632771298123456000L, + 1632771298123456789L, + 1632764098123456789L, + 1632783898123456789L, + + 1632769200000000000L, + 1632769200000000000L, + 1632771240000000000L, + 1632771298000000000L, + 1632771298123000000L, + 1632771298123456000L, + 1632771298123456789L, + 1632764098123456789L, + 1632783898123456789L)); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + + @Test + public void timestampSeconds() throws CsvReaderException { + final String input = "" + + "Values\n" + + "1632769200\n" + + "\n" + + "1632772800\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, + 1632772800000000000L)); + + invokeTest(csvReaderWithNulls().setParsers(List.of(Parsers.TIMESTAMP_SECONDS)), input, expected); + } + + @Test + public void timestampMillis() throws CsvReaderException { + final String input = "" + + "Values\n" + + "1632769200000\n" + + "\n" + + "1632772800000\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, + 1632772800000000000L)); + + invokeTest(csvReaderWithNulls().setParsers(List.of(Parsers.TIMESTAMP_MILLIS)), input, expected); + } + + @Test + public void timestampMicros() throws CsvReaderException { + final String input = "" + + "Values\n" + + "1632769200000000\n" + + "\n" + + "1632772800000000\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, + 1632772800000000000L)); + + 
invokeTest(csvReaderWithNulls().setParsers(List.of(Parsers.TIMESTAMP_MICROS)), input, expected); + } + + @Test + public void timestampNanos() throws CsvReaderException { + final String input = "" + + "Values\n" + + "1632769200000000000\n" + + "\n" + + "1632772800000000000\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, + 1632772800000000000L)); + + invokeTest(csvReaderWithNulls().setParsers(List.of(Parsers.TIMESTAMP_NANOS)), input, expected); + } + + @Test + public void dateTimeCustomizedTimezone() throws CsvReaderException { + final String input = "" + + "Values\n" + + "2021-09-27T19:00:00 UTC\n" + + "\n" + + "2021-09-27T20:00:00 UTC\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_LONG, 1632772800000000000L)); + + // Simple custom time zone parser that only understands " UTC" + Tokenizer.CustomTimeZoneParser myTimeZoneParser = (bs, tzo, off) -> { + if (bs.size() < 4) { + return false; + } + final byte[] d = bs.data(); + final int o = bs.begin(); + if (d[o] == ' ' && d[o + 1] == 'U' && d[o + 2] == 'T' && d[o + 3] == 'C') { + tzo.setValue(ZoneOffset.UTC); + off.setValue(0); + bs.setBegin(bs.begin() + 4); + return true; + } + return false; + }; + + invokeTest(csvReaderWithNulls() + .setCustomTimeZoneParser(myTimeZoneParser), + input, expected); + } + + private static final String ALL_NULLS = "" + + "Values\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n"; + + @Test(expected = CsvReaderException.class) + public void unparseable() throws CsvReaderException { + final String input = "" + + "Values\n" + + "hello\n" + + "there\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "hello", "there")); + + invokeTest(csvReaderWithNulls() + .setParsers(List.of(Parsers.INT, Parsers.LONG, Parsers.DATETIME)), input, expected); + } + + @Test(expected = CsvReaderException.class) + public void noParsers() throws CsvReaderException { + final String input = "" + + "Values\n" + + "hello\n" + + "there\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Values", "hello", "there")); + + invokeTest(csvReaderWithNulls().setParsers(List.of()), input, expected); + } + + @Test + public void allNullsWithSpecifiedParser() throws CsvReaderException { + final long nv = Sentinels.NULL_LONG; + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", nv, nv, nv, nv, nv)); + + invokeTest(csvReaderWithNulls().setParserFor("Values", Parsers.LONG), ALL_NULLS, expected); + } + + @Test + public void allNullsWithNullParser() throws CsvReaderException { + final long nv = Sentinels.NULL_LONG; + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Values", nv, nv, nv, nv, nv)); + + invokeTest(csvReaderWithNulls().setNullParser(Parsers.LONG), ALL_NULLS, expected); + } + + @Test(expected = CsvReaderException.class) + public void allNullsButNoParser() throws CsvReaderException { + final long nv = Sentinels.NULL_LONG; + final ColumnSet expected = ColumnSet.of( + Column.ofValues("SomeLongs", nv, nv, nv, nv, nv)); + + invokeTest(csvReaderWithNulls(), ALL_NULLS, expected); + } + + @Test + public void unicode() throws CsvReaderException { + final String input = "" + + "Emojis\n" + + "Hello 💖\n" + + "Regular ASCII\n" + + "✨ Deephaven ✨\n" + + "🎆🎆🎆🎆🎆\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("Emojis", "Hello 💖", "Regular ASCII", "✨ Deephaven ✨", "🎆🎆🎆🎆🎆")); + + invokeTest(csvReaderWithNulls(), input, expected); + } + + 
+    @Test
+    public void largeCells() throws CsvReaderException {
+        final StringBuilder sbBytes = new StringBuilder();
+        final StringBuilder sbChars = new StringBuilder();
+        final StringBuilder sbQuotesEscaped = new StringBuilder();
+        final StringBuilder sbQuotesLiteral = new StringBuilder();
+        for (int ii = 0; ii < 1000; ++ii) {
+            sbBytes.append("Deephaven!");
+            sbChars.append("🍣Deep🍔haven!🍕");
+            sbQuotesEscaped.append("Deep\"\"haven!");
+            sbQuotesLiteral.append("Deep\"haven!");
+        }
+        final String largeCellBytes = sbBytes.toString();
+        final String largeCellChars = sbChars.toString();
+        final String largeCellEscaped = '"' + sbQuotesEscaped.toString() + '"';
+        final String largeCellLiteral = sbQuotesLiteral.toString();
+
+        final String input = ""
+                + "LargeEmojis\n"
+                + largeCellBytes + "\n"
+                + largeCellChars + "\n"
+                + largeCellEscaped + "\n"
+                + largeCellBytes + "\n"
+                + largeCellChars + "\n"
+                + largeCellEscaped + "\n";
+
+        final ColumnSet expected = ColumnSet.of(
+                Column.ofRefs("LargeEmojis", largeCellBytes, largeCellChars, largeCellLiteral,
+                        largeCellBytes, largeCellChars, largeCellLiteral));
+
+        invokeTest(csvReaderWithNulls(), input, expected);
+    }
+
+    @Test
+    public void customGlobalNullValue() throws CsvReaderException {
+        final String input = ""
+                + "SomeBytes,SomeShorts,SomeInts,SomeLongs\n"
+                + "1,2,3,4\n"
+                + "NULL,NULL,NULL,NULL\n"
+                + "100,32000,2000000000,4000000000\n";
+
+        final ColumnSet expected = ColumnSet.of(
+                Column.ofValues("SomeBytes", (byte) 1, Sentinels.NULL_BYTE, (byte) 100),
+                Column.ofValues("SomeShorts", (short) 2, Sentinels.NULL_SHORT, (short) 32000),
+                Column.ofValues("SomeInts", 3, Sentinels.NULL_INT, 2000000000),
+                Column.ofValues("SomeLongs", 4L, Sentinels.NULL_LONG, 4000000000L));
+
+        invokeTest(csvReaderWithNulls().setParsers(Parsers.COMPLETE).setNullValueLiteral("NULL"), input, expected);
+    }
+
+    /**
+     * Test column-specific NULL literal values, which may be specified by column index or by name, and may be Unicode.
+     */
+    @Test
+    public void customColumnSpecificNullValue() throws CsvReaderException {
+        final String input = ""
+                + "SomeBytes,SomeShorts,SomeInts,SomeLongs\n"
+                + "1,2,3,4\n"
+                + "❌,🔥,⋰⋱,𝓓𝓮𝓮𝓹𝓱𝓪𝓿𝓮𝓷\n"
+                + "100,32000,2000000000,4000000000\n";
+
+        final ColumnSet expected = ColumnSet.of(
+                Column.ofValues("SomeBytes", (byte) 1, Sentinels.NULL_BYTE, (byte) 100),
+                Column.ofValues("SomeShorts", (short) 2, Sentinels.NULL_SHORT, (short) 32000),
+                Column.ofValues("SomeInts", 3, Sentinels.NULL_INT, 2000000000),
+                Column.ofValues("SomeLongs", 4L, Sentinels.NULL_LONG, 4000000000L));
+
+        invokeTest(csvReaderWithNulls()
+                .setParsers(Parsers.COMPLETE)
+                .setNullValueLiteralFor(1, "❌")
+                .setNullValueLiteralFor(2, "🔥")
+                .setNullValueLiteralFor("SomeInts", "⋰⋱")
+                .setNullValueLiteralFor("SomeLongs", "𝓓𝓮𝓮𝓹𝓱𝓪𝓿𝓮𝓷"),
+                input, expected);
+    }
+
+    /**
+     * Provide a number of rows larger than Parser.DEST_BLOCK_SIZE.
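+     * With Parser.DEST_BLOCK_SIZE + 4 rows, every sink receives more than one write() call, covering the append path
+     * across block boundaries.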
+ */ + @Test + public void manyRows() throws CsvReaderException { + final StringBuilder sb = new StringBuilder(); + sb.append( + "SomeBooleans,SomeBytes,SomeShorts,SomeInts,SomeLongs,SomeFloats,SomeDoubles,SomeStrings,SomeChars,SomeDateTimes,SomeTimestamps\n"); + final TByteArrayList booleans = new TByteArrayList(); + final TByteArrayList bytes = new TByteArrayList(); + final TShortArrayList shorts = new TShortArrayList(); + final TIntArrayList ints = new TIntArrayList(); + final TLongArrayList longs = new TLongArrayList(); + final TFloatArrayList floats = new TFloatArrayList(); + final TDoubleArrayList doubles = new TDoubleArrayList(); + final ArrayList strings = new ArrayList<>(); + final TCharArrayList chars = new TCharArrayList(); + final TLongArrayList dateTimesAsLongs = new TLongArrayList(); + final TLongArrayList timestampsAsLongs = new TLongArrayList(); + final String qq = "qq"; + final long dtl = 799402088000000000L; // 1995-05-02 08:08:08Z + final long tsl = 3456789012L; + // Make sure we have a few more rows than Parser.DEST_BLOCK_SIZE + for (int ii = 0; ii < Parser.DEST_BLOCK_SIZE + 3; ++ii) { + sb.append("true,5,6,7,8,9.9,1.1,qq,r,1995-05-02 08:08:08Z,3456789012\n"); + booleans.add((byte) 1); + bytes.add((byte) 5); + shorts.add((short) 6); + ints.add(7); + longs.add(8); + floats.add(9.9f); + doubles.add(1.1); + strings.add(qq); + chars.add('r'); + dateTimesAsLongs.add(dtl); + timestampsAsLongs.add(tsl); + } + // Add a row like this somewhere (let's put it at the end to make things challenging) so inference picks the + // right types. + sb.append("false,100,32000,2000000000,4000000000,5.5,6.6e50,yy,z,2020-03-05 12:34:56Z,123456789\n"); + booleans.add((byte) 0); + bytes.add((byte) 100); + shorts.add((short) 32000); + ints.add(2000000000); + longs.add(4000000000L); + floats.add(5.5f); + doubles.add(6.6e50); + strings.add("yy"); + chars.add('z'); + dateTimesAsLongs.add(1583411696000000000L); // 2020-03-05 12:34:56Z + timestampsAsLongs.add(123456789); + + final String input = sb.toString(); + final ColumnSet expected = ColumnSet.of( + Column.ofArray("SomeBooleans", booleans.toArray()), + Column.ofArray("SomeBytes", bytes.toArray()), + Column.ofArray("SomeShorts", shorts.toArray()), + Column.ofArray("SomeInts", ints.toArray()), + Column.ofArray("SomeLongs", longs.toArray()), + Column.ofArray("SomeFloats", floats.toArray()), + Column.ofArray("SomeDoubles", doubles.toArray()), + Column.ofArray("SomeStrings", strings.toArray(new String[0])), + Column.ofArray("SomeChars", chars.toArray()), + Column.ofArray("SomeDateTimes", dateTimesAsLongs.toArray()), + Column.ofArray("SomeTimestamps", timestampsAsLongs.toArray())); + invokeTest(csvReaderWithNulls() + .setParsers(Parsers.COMPLETE) + .setParserFor("SomeTimestamps", Parsers.TIMESTAMP_NANOS), + input, expected); + } + + @Test + public void customParser() throws CsvReaderException { + final String bd1 = + "81290897538197389132106321892137218932178913227138932178912312132.21879213278912387692138723198"; + final String bd2 = + "-9210381027382193791312718239712389127812931236183167913268912683921681293621891236821.12986178632478123678312762318"; + + final String input = "" + + "Index,BigValues\n" + + "0," + bd1 + "\n" + + "1,\n" + + "2," + bd2 + "\n"; + + final ColumnSet expected = ColumnSet.of( + Column.ofValues("Index", 0, 1, 2), + Column.ofRefs("BigValues", new BigDecimal(bd1), null, new BigDecimal(bd2))); + + invokeTest(csvReaderWithNulls() + .setParserFor(2, new MyBigDecimalParser()), + input, expected); + } + + private static class 
MyBigDecimalParser implements Parser<BigDecimal[]> {
+        @Override
+        public Sink<BigDecimal[]> tryParse(ParseContext ctx, IteratorHolder ih, Sink<BigDecimal[]> sink, long current,
+                long end) throws CsvReaderException {
+            final BigDecimal[] chunk = new BigDecimal[DEST_BLOCK_SIZE];
+            // Need a character buffer for the BigDecimal constructor. Grows as needed.
+            char[] charData = new char[0];
+
+            final boolean appending = sink == null;
+            if (appending) {
+                sink = new MyBigDecimalSink();
+            }
+
+            int chunkIndex = 0;
+            do {
+                if (chunkIndex == chunk.length) {
+                    sink.write(chunk, 0, current, chunkIndex, appending);
+                    current += chunkIndex;
+                    chunkIndex = 0;
+                }
+                if (current + chunkIndex == end) {
+                    break;
+                }
+                if (ctx.isNullCell(ih)) {
+                    chunk[chunkIndex++] = null;
+                    continue;
+                }
+                if (!ih.hasBytes()) {
+                    return null;
+                }
+                final ByteSlice bs = ih.bs();
+
+                // Convert bytes to chars. Annoying.
+                if (charData.length < bs.size()) {
+                    charData = new char[bs.size()];
+                }
+                int destIndex = 0;
+                for (int cur = bs.begin(); cur != bs.end(); ++cur) {
+                    charData[destIndex++] = (char) bs.data()[cur];
+                }
+
+                try {
+                    chunk[chunkIndex++] = new BigDecimal(charData, 0, destIndex);
+                } catch (NumberFormatException ne) {
+                    return null;
+                }
+            } while (ih.tryMoveNext());
+            sink.write(chunk, 0, current, chunkIndex, appending);
+            return sink;
+        }
+    }
+
+    private static class MyBigDecimalSink implements Sink<BigDecimal[]>, ColumnProvider<BigDecimal[]> {
+        private final List<BigDecimal> dest = new ArrayList<>();
+
+        @Override
+        public final void write(final BigDecimal[] src, final int srcOffset, final long destOffset, final int size,
+                final boolean appending) {
+            if (size == 0) {
+                return;
+            }
+
+            if (appending) {
+                // If the new area starts beyond the end of the destination, pad the destination.
+                while (dest.size() < destOffset) {
+                    dest.add(null);
+                }
+                for (int ii = 0; ii < size; ++ii) {
+                    dest.add(src[srcOffset + ii]);
+                }
+                return;
+            }
+
+            final int destOffsetAsInt = Math.toIntExact(destOffset);
+            for (int ii = 0; ii < size; ++ii) {
+                dest.set(destOffsetAsInt + ii, src[srcOffset + ii]);
+            }
+        }
+
+        @Override
+        public Column<BigDecimal[]> toColumn(final String columnName) {
+            return Column.ofArray(columnName, dest.toArray(new BigDecimal[0]));
+        }
+    }
+
+    private static final class ColumnSet {
+        private final Column<?>[] columns;
+        private final int columnSize;
+
+        public static ColumnSet of(Column<?>... columns) {
+            if (columns.length == 0) {
+                throw new RuntimeException("Empty column set is not permitted");
+            }
+            final int c0Size = columns[0].size();
+            for (int ii = 1; ii < columns.length; ++ii) { // Deliberately starting at 1.
+                final int ciiSize = columns[ii].size();
+                if (ciiSize != c0Size) {
+                    throw new RuntimeException(
+                            String.format("Column %d (size %d) has a different size than column 0 (size %d)",
+                                    ii, ciiSize, c0Size));
+                }
+            }
+            return new ColumnSet(columns, c0Size);
+        }
+
+        private ColumnSet(Column<?>[] columns, int columnSize) {
+            this.columns = columns;
+            this.columnSize = columnSize;
+        }
+
+        @Override
+        public String toString() {
+            final StringBuilder sb = new StringBuilder();
+            final List<Column<?>> colList = List.of(columns);
+
+            Renderer.renderList(sb, colList, ",",
+                    col -> String.format("%s(%s)", col.name(), col.elementType().getCanonicalName()));
+            for (int jj = 0; jj < columnSize; ++jj) {
+                final int jjFinal = jj;
+                sb.append('\n');
+                Renderer.renderList(sb, colList, ",", col -> safeToString(col.getItem(jjFinal)));
+            }
+            return sb.toString();
+        }
+
+        private static String safeToString(Object o) {
+            return o == null ? "(null)" : o.toString();
+        }
+    }
+
+    private static final class Column<TARRAY> {
+        private final String name;
+        private final TARRAY values;
+        private final int size;
+
+        public static Column<byte[]> ofValues(final String name, final byte... values) {
+            return ofArray(name, values);
+        }
+
+        public static Column<short[]> ofValues(final String name, final short... values) {
+            return ofArray(name, values);
+        }
+
+        public static Column<int[]> ofValues(final String name, final int... values) {
+            return ofArray(name, values);
+        }
+
+        public static Column<long[]> ofValues(final String name, final long... values) {
+            return ofArray(name, values);
+        }
+
+        public static Column<float[]> ofValues(final String name, final float... values) {
+            return ofArray(name, values);
+        }
+
+        public static Column<double[]> ofValues(final String name, final double... values) {
+            return ofArray(name, values);
+        }
+
+        public static Column<char[]> ofValues(final String name, final char... values) {
+            return ofArray(name, values);
+        }
+
+        public static <T> Column<T[]> ofRefs(final String name, final T... values) {
+            return ofArray(name, values);
+        }
+
+        public static <TARRAY> Column<TARRAY> ofArray(final String name, final TARRAY values) {
+            return new Column<>(name, values);
+        }
+
+        private Column(String name, TARRAY values) {
+            this.name = name;
+            this.values = values;
+            this.size = Array.getLength(values);
+        }
+
+        public int size() {
+            return size;
+        }
+
+        public String name() {
+            return name;
+        }
+
+        public Class<?> elementType() {
+            return values.getClass().getComponentType();
+        }
+
+        public Object getItem(int index) {
+            return Array.get(values, index);
+        }
+    }
+
+    private static CsvReader defaultCsvReader() {
+        return new CsvReader();
+    }
+
+    private static CsvReader csvReaderWithNulls() {
+        return new CsvReader()
+                .setIgnoreSurroundingSpaces(true)
+                .setNullBooleanAsByteValue(Sentinels.NULL_BOOLEAN_AS_BYTE)
+                .setNullByteValue(Sentinels.NULL_BYTE)
+                .setNullShortValue(Sentinels.NULL_SHORT)
+                .setNullIntValue(Sentinels.NULL_INT)
+                .setNullLongValue(Sentinels.NULL_LONG)
+                .setNullFloatValue(Sentinels.NULL_FLOAT)
+                .setNullDoubleValue(Sentinels.NULL_DOUBLE)
+                .setNullCharValue(Sentinels.NULL_CHAR)
+                .setNullStringValue(Sentinels.NULL_STRING)
+                .setNullDateTimeAsLongValue(Sentinels.NULL_DATETIME_AS_LONG)
+                .setNullTimestampAsLongValue(Sentinels.NULL_TIMESTAMP_AS_LONG);
+    }
+
+    private static void invokeTest(CsvReader csvReader, String input, ColumnSet expected) throws CsvReaderException {
+        final ColumnSet actual;
+        try (final Reader reader = new StringReader(input)) {
+            actual = parse(csvReader, reader);
+        } catch (IOException inner) {
+            throw new CsvReaderException("Caught", inner);
+        }
+        final String expectedToString = expected.toString();
+        final String actualToString = actual.toString();
+        Assertions.assertThat(actualToString).isEqualTo(expectedToString);
+    }
+
+    /**
+     * Parses {@code reader} according to the specifications of {@code csvReader}. The {@code reader} will be closed
+     * upon return.
+     *
+     * <p>
+     * Note: this implementation will buffer the {@code reader} internally.
+     *
+     * @param csvReader the configured CsvReader
+     * @param reader the reader
+     * @return the parsed column set
+     * @throws CsvReaderException If any sort of failure occurs.
+     */
+    private static ColumnSet parse(CsvReader csvReader, Reader reader) throws CsvReaderException {
+        final CsvReader.Result result = csvReader.read(reader, new MySinkFactory());
+
+        final int numCols = result.numCols();
+
+        final String[] columnNames = result.columnNames();
+        final Sink<?>[] sinks = result.columns();
+        final Column<?>[] columns = new Column<?>[numCols];
+        for (int ii = 0; ii < numCols; ++ii) {
+            final String columnName = columnNames[ii];
+            final ColumnProvider<?> sink = (ColumnProvider<?>) sinks[ii];
+            columns[ii] = sink.toColumn(columnName);
+        }
+        return ColumnSet.of(columns);
+    }
+
+    public interface ColumnProvider<TARRAY> {
+        Column<TARRAY> toColumn(final String columnName);
+    }
+
+    private static abstract class MySinkBase<TCOLLECTION, TARRAY> implements Sink<TARRAY>, ColumnProvider<TARRAY> {
+        private final TCOLLECTION dest;
+        private final ToIntFunction<TCOLLECTION> sizeGetter;
+        private final ObjIntConsumer<TCOLLECTION> padOperation;
+        private final CopyOperation<TCOLLECTION, TARRAY> copyOperation;
+        private final AppendOperation<TCOLLECTION, TARRAY> appendOperation;
+        private final BiFunction<TCOLLECTION, String, Column<TARRAY>> toColumnOperation;
+
+        protected MySinkBase(final TCOLLECTION dest, final ToIntFunction<TCOLLECTION> sizeGetter,
+                final ObjIntConsumer<TCOLLECTION> padOperation,
+                final CopyOperation<TCOLLECTION, TARRAY> copyOperation,
+                final AppendOperation<TCOLLECTION, TARRAY> appendOperation,
+                final BiFunction<TCOLLECTION, String, Column<TARRAY>> toColumnOperation) {
+            this.dest = dest;
+            this.sizeGetter = sizeGetter;
+            this.padOperation = padOperation;
+            this.copyOperation = copyOperation;
+            this.appendOperation = appendOperation;
+            this.toColumnOperation = toColumnOperation;
+        }
+
+        @Override
+        public final void write(final TARRAY src, final int srcOffset, final long destOffset, final int size,
+                boolean appending) {
+            if (size == 0) {
+                return;
+            }
+
+            final int destBegin = Math.toIntExact(destOffset);
+            if (appending) {
+                // If the new area starts beyond the end of the destination, pad the destination.
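+                // (padOperation grows dest with default-valued elements; appendOperation then adds the new data at
+                // the end.)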
+                int destSize = sizeGetter.applyAsInt(dest);
+                if (destBegin > destSize) {
+                    padOperation.accept(dest, destBegin - destSize);
+                }
+                appendOperation.apply(dest, src, srcOffset, size);
+                return;
+            }
+
+            copyOperation.apply(dest, destBegin, src, srcOffset, size);
+        }
+
+        public final Column<TARRAY> toColumn(final String columnName) {
+            return toColumnOperation.apply(dest, columnName);
+        }
+
+        private interface CopyOperation<TCOLLECTION, TARRAY> {
+            void apply(TCOLLECTION dest, int destOffset, TARRAY src, int srcOffset, int size);
+        }
+
+        private interface AppendOperation<TCOLLECTION, TARRAY> {
+            void apply(TCOLLECTION dest, TARRAY src, int srcOffset, int size);
+        }
+    }
+
+    private static final class MyCharSink extends MySinkBase<TCharArrayList, char[]> {
+        public MyCharSink() {
+            super(new TCharArrayList(),
+                    TCharArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, (char) 0),
+                    TCharArrayList::set,
+                    TCharArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static class MyByteSinkBase extends MySinkBase<TByteArrayList, byte[]> {
+        public MyByteSinkBase() {
+            super(new TByteArrayList(),
+                    TByteArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, (byte) 0),
+                    TByteArrayList::set,
+                    TByteArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static final class MyBooleanAsByteSink extends MyByteSinkBase {
+    }
+
+    private static final class MyByteSink extends MyByteSinkBase {
+    }
+
+    private static final class MyShortSink extends MySinkBase<TShortArrayList, short[]> {
+        public MyShortSink() {
+            super(new TShortArrayList(),
+                    TShortArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, (short) 0),
+                    TShortArrayList::set,
+                    TShortArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static final class MyIntSink extends MySinkBase<TIntArrayList, int[]> {
+        public MyIntSink() {
+            super(new TIntArrayList(),
+                    TIntArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, 0),
+                    TIntArrayList::set,
+                    TIntArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static class MyLongSinkBase extends MySinkBase<TLongArrayList, long[]> {
+        public MyLongSinkBase() {
+            super(new TLongArrayList(),
+                    TLongArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, 0),
+                    TLongArrayList::set,
+                    TLongArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static final class MyLongSink extends MyLongSinkBase {
+    }
+
+    private static final class MyFloatSink extends MySinkBase<TFloatArrayList, float[]> {
+        public MyFloatSink() {
+            super(new TFloatArrayList(),
+                    TFloatArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, 0),
+                    TFloatArrayList::set,
+                    TFloatArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static final class MyDoubleSink extends MySinkBase<TDoubleArrayList, double[]> {
+        public MyDoubleSink() {
+            super(new TDoubleArrayList(),
+                    TDoubleArrayList::size,
+                    (dest, size) -> dest.fill(dest.size(), dest.size() + size, 0),
+                    TDoubleArrayList::set,
+                    TDoubleArrayList::add,
+                    (dest, name) -> Column.ofArray(name, dest.toArray()));
+        }
+    }
+
+    private static final class MyStringSink extends MySinkBase<ArrayList<String>, String[]> {
+        public MyStringSink() {
+            super(new ArrayList<>(),
+                    ArrayList::size,
+                    MyStringSink::pad,
+                    MyStringSink::copy,
+                    MyStringSink::append,
+                    (dest, name) -> Column.ofArray(name, dest.toArray(new String[0])));
+        }
+
+        private static void pad(final ArrayList<String> dest, int size) {
+            while (size-- != 0) {
+                dest.add(null);
+            }
+        }
+
+        private static void copy(final ArrayList<String> dest, final int destOffset, final String[] src,
+                final int srcOffset, final int size) {
+            for (int ii = 0; ii < size; ++ii) {
+                dest.set(destOffset + ii, src[srcOffset + ii]);
+            }
+        }
+
+        private static void append(final ArrayList<String> dest, final String[] src, final int srcOffset,
+                final int size) {
+            for (int ii = 0; ii < size; ++ii) {
+                dest.add(src[srcOffset + ii]);
+            }
+        }
+    }
+
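+    // DateTime and Timestamp columns are both materialized as epoch-nanos longs in these tests, so their sinks simply
+    // reuse MyLongSinkBase.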
+    private static final class MyDateTimeAsLongSink extends MyLongSinkBase {
+    }
+
+    private static final class MyTimestampAsLongSink extends MyLongSinkBase {
+    }
+
+    private static class MySinkFactory implements SinkFactory {
+        @Override
+        public Sink<byte[]> makeBooleanAsByteSink() {
+            return new MyBooleanAsByteSink();
+        }
+
+        @Override
+        public Sink<byte[]> makeByteSink() {
+            return new MyByteSink();
+        }
+
+        @Override
+        public Sink<short[]> makeShortSink() {
+            return new MyShortSink();
+        }
+
+        @Override
+        public Sink<int[]> makeIntSink() {
+            return new MyIntSink();
+        }
+
+        @Override
+        public Sink<long[]> makeLongSink() {
+            return new MyLongSink();
+        }
+
+        @Override
+        public Sink<float[]> makeFloatSink() {
+            return new MyFloatSink();
+        }
+
+        @Override
+        public Sink<double[]> makeDoubleSink() {
+            return new MyDoubleSink();
+        }
+
+        @Override
+        public Sink<char[]> makeCharSink() {
+            return new MyCharSink();
+        }
+
+        @Override
+        public Sink<String[]> makeStringSink() {
+            return new MyStringSink();
+        }
+
+        @Override
+        public Sink<long[]> makeDateTimeAsLongSink() {
+            return new MyDateTimeAsLongSink();
+        }
+
+        @Override
+        public Sink<long[]> makeTimestampAsLongSink() {
+            return new MyTimestampAsLongSink();
+        }
+    }
+}
diff --git a/extensions/csv/src/test/java/io/deephaven/csv/CsvTest.java b/extensions/csv/src/test/java/io/deephaven/csv/CsvTest.java
deleted file mode 100644
index e8738950279..00000000000
--- a/extensions/csv/src/test/java/io/deephaven/csv/CsvTest.java
+++ /dev/null
@@ -1,529 +0,0 @@
-package io.deephaven.csv;
-
-import io.deephaven.qst.column.header.ColumnHeader;
-import io.deephaven.qst.column.header.ColumnHeaders3;
-import io.deephaven.qst.table.NewTable;
-import io.deephaven.qst.table.TableHeader;
-import org.assertj.core.api.Assertions;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.time.Instant;
-import java.time.LocalDateTime;
-import java.time.ZoneOffset;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Objects;
-
-@RunWith(Parameterized.class)
-public class CsvTest {
-
-    private static final Instant TIMESTAMP_A = LocalDateTime.of(2021, 9, 27, 19, 0, 0).toInstant(ZoneOffset.UTC);
-    private static final Instant TIMESTAMP_B = LocalDateTime.of(2021, 9, 27, 20, 0, 0).toInstant(ZoneOffset.UTC);
-
-    @Parameters(name = "{0}")
-    public static Iterable<Object[]> parameters() {
-        return () -> tests().stream().map(CsvTest::parameterize).iterator();
-    }
-
-    public static List<CsvTest> tests() {
-        return Arrays.asList(
-                timestamp(),
-                timestampSeconds(),
-                timestampMillis(),
-                timestampMicros(),
-                timestampNanos(),
-                timestampMixed(),
-                timestampLegacy(),
-                bools(),
-                chars(),
-                byteIsShort(),
-                byteViaHeader(),
-                byteViaInference(),
-                shortRange(),
-                intRange(),
-                longRange(),
-                longAsStringsViaInference(),
-                longAsStringsViaParser(),
-                longAsStringsViaHeader(),
-                longOverrideStringInference(),
-                doubleRange(),
-                floatIsDouble(),
-                floatViaHeader(),
-                floatViaInference(),
-                floatFromDouble(),
-                strings(),
-                stringsPound(),
-
languageExample(), - languageExampleTsv(), - languageExampleHeaderless(), - languageExampleHeaderlessExplicit(), - whitespaceNoQuotes(), - whitespaceNoQuotesLiteral(), - whitespaceOutside(), - whitespaceInsideDefault(), - whitespaceInsideTrim(), - whitespaceInsideAndOutsideDefault(), - whitespaceInsideAndOutsideTrim()); - } - - public static CsvTest timestamp() { - final NewTable expected = ColumnHeader.ofInstant("Timestamp") - .row(TIMESTAMP_A) - .row(null) - .row(TIMESTAMP_B) - .newTable(); - return new CsvTest("timestamp", "timestamp.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest timestampSeconds() { - final NewTable expected = ColumnHeader.ofInstant("Timestamp") - .row(TIMESTAMP_A) - .row(null) - .row(TIMESTAMP_B) - .newTable(); - return new CsvTest("timestampSeconds", "timestamp-seconds.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest timestampMillis() { - final NewTable expected = ColumnHeader.ofInstant("Timestamp") - .row(TIMESTAMP_A) - .row(null) - .row(TIMESTAMP_B) - .newTable(); - return new CsvTest("timestampMillis", "timestamp-millis.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest timestampMicros() { - final NewTable expected = ColumnHeader.ofInstant("Timestamp") - .row(TIMESTAMP_A) - .row(null) - .row(TIMESTAMP_B) - .newTable(); - return new CsvTest("timestampMicros", "timestamp-micros.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest timestampNanos() { - final NewTable expected = ColumnHeader.ofInstant("Timestamp") - .row(TIMESTAMP_A) - .row(null) - .row(TIMESTAMP_B) - .newTable(); - return new CsvTest("timestampNanos", "timestamp-nanos.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest timestampMixed() { - // Can't infer milli and micros in a single column - will parse as a long, not an Instant. 
- final NewTable expected = ColumnHeader.ofLong("Timestamp") - .row(TIMESTAMP_A.toEpochMilli()) - .row(null) - .row(TIMESTAMP_B.toEpochMilli() * 1000L) - .newTable(); - return new CsvTest("timestampMixed", "timestamp-mixed.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest timestampLegacy() { - final NewTable expected = ColumnHeader.ofInstant("Timestamp") - .row(TIMESTAMP_A) - .row(null) - .row(TIMESTAMP_B) - .newTable(); - return new CsvTest("timestampLegacy", "timestamp-legacy.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest bools() { - final NewTable expected = ColumnHeader.ofBoolean("Bool") - .row(true) - .row(null) - .row(false) - .row(true) - .row(false) - .row(true) - .row(false) - .newTable(); - return new CsvTest("bools", "bools.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest chars() { - final NewTable expected = ColumnHeader.ofChar("Char") - .row('A') - .row(null) - .row('B') - .row('C') - .row('1') - .row('2') - .row('3') - .newTable(); - return new CsvTest("chars", "chars.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest byteViaHeader() { - final NewTable expected = ColumnHeader.ofByte("Byte") - .row((byte) (Byte.MIN_VALUE + 1)) - .row(null) - .row(Byte.MAX_VALUE) - .newTable(); - return new CsvTest("byteViaHeader", "byte.csv", CsvSpecs.builder().header(expected.header()).build(), expected); - } - - public static CsvTest byteViaInference() { - final NewTable expected = ColumnHeader.ofByte("Byte") - .row((byte) (Byte.MIN_VALUE + 1)) - .row(null) - .row(Byte.MAX_VALUE) - .newTable(); - return new CsvTest("byteViaInference", "byte.csv", - CsvSpecs.builder().inference(InferenceSpecs.builder().addParsers(Parser.BYTE).build()).build(), - expected); - } - - public static CsvTest byteIsShort() { - // By default, byte will be parsed as short - final NewTable expected = ColumnHeader.ofShort("Byte") - .row((short) (Byte.MIN_VALUE + 1)) - .row(null) - .row((short) Byte.MAX_VALUE) - .newTable(); - return new CsvTest("byteIsShort", "byte.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest shortRange() { - final NewTable expected = ColumnHeader.ofShort("Short") - .row((short) (Short.MIN_VALUE + 1)) - .row(null) - .row(Short.MAX_VALUE) - .newTable(); - return new CsvTest("shortRange", "short.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest intRange() { - final NewTable expected = ColumnHeader.ofInt("Int") - .row(Integer.MIN_VALUE + 1) - .row(null) - .row(Integer.MAX_VALUE) - .newTable(); - return new CsvTest("intRange", "int.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest longRange() { - final NewTable expected = ColumnHeader.ofLong("Long") - .row(Long.MIN_VALUE + 1) - .row(null) - .row(Long.MAX_VALUE) - .newTable(); - return new CsvTest("longRange", "long.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest longAsStringsViaInference() { - final NewTable expected = ColumnHeader.ofString("Long") - .row("-9223372036854775807") - .row(null) - .row("9223372036854775807") - .newTable(); - return new CsvTest("longAsStringsViaInference", "long.csv", - CsvSpecs.builder().inference(InferenceSpecs.strings()).build(), expected); - } - - public static CsvTest longAsStringsViaParser() { - final NewTable expected = ColumnHeader.ofString("Long") - .row("-9223372036854775807") - .row(null) - .row("9223372036854775807") - .newTable(); - return new CsvTest("longAsStringsViaParser", "long.csv", - CsvSpecs.builder().putParsers("Long", Parser.STRING).build(), expected); - } - - public static CsvTest 
longAsStringsViaHeader() { - final NewTable expected = ColumnHeader.ofString("Long") - .row("-9223372036854775807") - .row(null) - .row("9223372036854775807") - .newTable(); - return new CsvTest("longAsStringsViaHeader", "long.csv", CsvSpecs.builder().header(expected.header()).build(), - expected); - } - - public static CsvTest longOverrideStringInference() { - final NewTable expected = ColumnHeader.ofLong("Long") - .row(Long.MIN_VALUE + 1) - .row(null) - .row(Long.MAX_VALUE) - .newTable(); - final InferenceSpecs stringOnlyInference = InferenceSpecs.strings(); - final TableHeader tableHeader = ColumnHeader.ofLong("Long").tableHeader(); - final CsvSpecs csvSpecs = CsvSpecs.builder().inference(stringOnlyInference).header(tableHeader).build(); - return new CsvTest("longOverrideStringInference", "long.csv", csvSpecs, expected); - } - - public static CsvTest floatIsDouble() { - // By defaults, floats are parsed as double - final NewTable expected = ColumnHeader.ofDouble("Float") - .row((double) Float.POSITIVE_INFINITY) - .row(null) - .row((double) Float.NEGATIVE_INFINITY) - .row((double) Float.NaN).row(3.4028235e+38d) - .row(1.17549435E-38d) - .row(1.4e-45d) - .newTable(); - return new CsvTest("floatIsDouble", "floats.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest floatViaHeader() { - final NewTable expected = ColumnHeader.ofFloat("Float") - .row(Float.POSITIVE_INFINITY) - .row(null) - .row(Float.NEGATIVE_INFINITY) - .row(Float.NaN) - .row(Float.MAX_VALUE) - .row(Float.MIN_NORMAL) - .row(Float.MIN_VALUE) - .newTable(); - return new CsvTest("floatViaHeader", "floats.csv", CsvSpecs.builder().header(expected.header()).build(), - expected); - } - - public static CsvTest floatViaInference() { - final NewTable expected = ColumnHeader.ofFloat("Float") - .row(Float.POSITIVE_INFINITY) - .row(null) - .row(Float.NEGATIVE_INFINITY) - .row(Float.NaN) - .row(Float.MAX_VALUE) - .row(Float.MIN_NORMAL) - .row(Float.MIN_VALUE) - .newTable(); - return new CsvTest("floatViaInference", "floats.csv", - CsvSpecs.builder().inference(InferenceSpecs.builder().addParsers(Parser.FLOAT).build()).build(), - expected); - } - - public static CsvTest floatFromDouble() { - final NewTable expected = ColumnHeader.ofFloat("Double") - .row((float) Double.POSITIVE_INFINITY) - .row(null) - .row((float) Double.NEGATIVE_INFINITY) - .row((float) Double.NaN) - .row((float) Double.MAX_VALUE) - .row((float) Double.MIN_NORMAL) - .row((float) Double.MIN_VALUE) - .newTable(); - return new CsvTest("floatFromDouble", "doubles.csv", CsvSpecs.builder().header(expected.header()).build(), - expected); - } - - public static CsvTest doubleRange() { - final NewTable expected = ColumnHeader.ofDouble("Double") - .row(Double.POSITIVE_INFINITY) - .row(null) - .row(Double.NEGATIVE_INFINITY) - .row(Double.NaN) - .row(Double.MAX_VALUE) - .row(Double.MIN_NORMAL) - .row(Double.MIN_VALUE) - .newTable(); - return new CsvTest("doubleRange", "doubles.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest strings() { - final NewTable expected = ColumnHeader.ofString("String") - .row("Hello, world") - .row(null) - .row("Goodbye.") - .newTable(); - return new CsvTest("strings", "strings.csv", CsvSpecs.csv(), expected); - } - - public static CsvTest stringsPound() { - final NewTable expected = ColumnHeader.ofString("String") - .row("Hello, world") - .row(null) - .row("Goodbye.") - .newTable(); - return new CsvTest("stringsPound", "strings-pound.csv", CsvSpecs.builder().quote('#').build(), expected); - } - - public static CsvTest 
languageExample() { - return new CsvTest("languageExample", "language-example.csv", CsvSpecs.csv(), languageCreatorTypeTable()); - } - - public static CsvTest languageExampleTsv() { - return new CsvTest("languageExampleTsv", "language-example.tsv", CsvSpecs.tsv(), languageCreatorTypeTable()); - } - - public static CsvTest languageExampleHeaderless() { - return new CsvTest("languageExampleHeaderless", "language-example-headerless.csv", CsvSpecs.headerless(), - languageCreatorTypeTableHeaderless()); - } - - public static CsvTest languageExampleHeaderlessExplicit() { - final NewTable expected = languageCreatorTypeTable(); - final CsvSpecs specs = CsvSpecs.headerless(expected.header()); - return new CsvTest("languageExampleHeaderlessExplicit", "language-example-headerless.csv", specs, expected); - } - - public static CsvTest whitespaceNoQuotes() { - final NewTable expected = ColumnHeader.of( - ColumnHeader.ofString("Sym"), - ColumnHeader.ofString("Type"), - ColumnHeader.ofDouble("Price"), - ColumnHeader.ofShort("SecurityId")) - .row("GOOG", "Dividend", 0.25, (short) 200) - .row("T", "Dividend", 0.15, (short) 300) - .row("Z", "Dividend", 0.18, (short) 500) - .newTable(); - final CsvSpecs specs = CsvSpecs.csv(); - return new CsvTest("whitespaceNoQuotes", "whitespace-no-quotes.csv", specs, expected); - } - - public static CsvTest whitespaceNoQuotesLiteral() { - final NewTable expected = ColumnHeader.of( - ColumnHeader.ofString("Sym"), - ColumnHeader.ofString("Type"), - ColumnHeader.ofString("Price"), - ColumnHeader.ofString("SecurityId")) - .row("GOOG", " Dividend", " 0.25", " 200") - .row("T", " Dividend", " 0.15", " 300") - .row(" Z", " Dividend", " 0.18", " 500") - .newTable(); - final CsvSpecs specs = CsvSpecs.builder().ignoreSurroundingSpaces(false).build(); - return new CsvTest("whitespaceNoQuotesLiteral", "whitespace-no-quotes.csv", specs, expected); - } - - public static CsvTest whitespaceOutside() { - final NewTable expected = ColumnHeader.of( - ColumnHeader.ofString("Sym"), - ColumnHeader.ofString("Type"), - ColumnHeader.ofDouble("Price"), - ColumnHeader.ofShort("SecurityId")) - .row("GOOG", "Dividend", 0.25, (short) 200) - .row("T", "Dividend", 0.15, (short) 300) - .row("Z", "Dividend", 0.18, (short) 500) - .newTable(); - final CsvSpecs specs = CsvSpecs.csv(); - return new CsvTest("whitespaceOutside", "whitespace-outside.csv", specs, expected); - } - - public static CsvTest whitespaceInsideDefault() { - final NewTable expected = ColumnHeader.of( - ColumnHeader.ofString("Sym"), - ColumnHeader.ofString("Type"), - ColumnHeader.ofString("Price"), - ColumnHeader.ofString("SecurityId")) - .row("GOOG", " Dividend", " 0.25", " 200") - .row("T", " Dividend", " 0.15", " 300") - .row(" Z", " Dividend", " 0.18", " 500") - .newTable(); - final CsvSpecs specs = CsvSpecs.csv(); - return new CsvTest("whitespaceInsideDefault", "whitespace-inside.csv", specs, expected); - } - - public static CsvTest whitespaceInsideTrim() { - final NewTable expected = ColumnHeader.of( - ColumnHeader.ofString("Sym"), - ColumnHeader.ofString("Type"), - ColumnHeader.ofDouble("Price"), - ColumnHeader.ofShort("SecurityId")) - .row("GOOG", "Dividend", 0.25, (short) 200) - .row("T", "Dividend", 0.15, (short) 300) - .row("Z", "Dividend", 0.18, (short) 500) - .newTable(); - final CsvSpecs specs = CsvSpecs.builder().trim(true).build(); - return new CsvTest("whitespaceInsideTrim", "whitespace-inside.csv", specs, expected); - } - - public static CsvTest whitespaceInsideAndOutsideDefault() { - final NewTable expected = 
ColumnHeader.of(
-                ColumnHeader.ofString("Sym"),
-                ColumnHeader.ofString("Type"),
-                ColumnHeader.ofString("Price"),
-                ColumnHeader.ofString("SecurityId"))
-                .row("GOOG", " Dividend", " 0.25", " 200")
-                .row("T", " Dividend", " 0.15", " 300")
-                .row(" Z", " Dividend", " 0.18", " 500")
-                .newTable();
-        final CsvSpecs specs = CsvSpecs.csv();
-        return new CsvTest("whitespaceInsideAndOutsideDefault", "whitespace-inside-and-outside.csv", specs, expected);
-    }
-
-    public static CsvTest whitespaceInsideAndOutsideTrim() {
-        final NewTable expected = ColumnHeader.of(
-                ColumnHeader.ofString("Sym"),
-                ColumnHeader.ofString("Type"),
-                ColumnHeader.ofDouble("Price"),
-                ColumnHeader.ofShort("SecurityId"))
-                .row("GOOG", "Dividend", 0.25, (short) 200)
-                .row("T", "Dividend", 0.15, (short) 300)
-                .row("Z", "Dividend", 0.18, (short) 500)
-                .newTable();
-        final CsvSpecs specs = CsvSpecs.builder().trim(true).build();
-        return new CsvTest("whitespaceInsideAndOutsideTrim", "whitespace-inside-and-outside.csv", specs, expected);
-    }
-
-    private static NewTable languageCreatorTypeTable() {
-        return populateLanguageExample(ColumnHeader.ofString("Language")
-                .header(ColumnHeader.ofString("Creator"))
-                .header(ColumnHeader.ofString("Type")));
-    }
-
-    private static NewTable languageCreatorTypeTableHeaderless() {
-        return populateLanguageExample(ColumnHeader.ofString("Column1")
-                .header(ColumnHeader.ofString("Column2"))
-                .header(ColumnHeader.ofString("Column3")));
-    }
-
-    private static NewTable populateLanguageExample(ColumnHeaders3<String, String, String> header) {
-        return header
-                .row("C", "Dennis Ritchie", "Compiled")
-                .row("C++", "Bjarne Stroustrup", "Compiled")
-                .row("Fortran", "John Backus", "Compiled")
-                .row("Java", "James Gosling", "Both")
-                .row("JavaScript", "Brendan Eich", "Interpreted")
-                .row("MATLAB", "Cleve Moler", "Interpreted")
-                .row("Pascal", "Niklas Wirth", "Compiled")
-                .row("Python", "Guido van Rossum", "Interpreted")
-                .newTable();
-    }
-
-    private final String name;
-    private final String resourceName;
-    private final CsvSpecs specs;
-    private final NewTable expected;
-
-    public CsvTest(String name, String resourceName, CsvSpecs specs, NewTable expected) {
-        this.name = Objects.requireNonNull(name);
-        this.resourceName = Objects.requireNonNull(resourceName);
-        this.specs = Objects.requireNonNull(specs);
-        this.expected = expected;
-    }
-
-    Object[] parameterize() {
-        return new Object[] {name, resourceName, specs, expected};
-    }
-
-    @Test
-    public void parseCsv() throws IOException {
-        final InputStream in = CsvTest.class.getResourceAsStream(resourceName);
-        if (in == null) {
-            throw new IllegalArgumentException("Unable to find resource " + resourceName);
-        }
-        final NewTable actual;
-        try (final Reader reader = new InputStreamReader(in, StandardCharsets.UTF_8)) {
-            actual = specs.parse(reader);
-        } catch (Parser.ParserException e) {
-            if (expected == null) {
-                // expected!
-                return;
-            }
-            throw e;
-        }
-        Assertions.assertThat(actual).isEqualTo(expected);
-    }
-}
diff --git a/extensions/csv/src/test/java/io/deephaven/csv/CsvTestCategory.java b/extensions/csv/src/test/java/io/deephaven/csv/CsvTestCategory.java
new file mode 100644
index 00000000000..c8ae14809a6
--- /dev/null
+++ b/extensions/csv/src/test/java/io/deephaven/csv/CsvTestCategory.java
@@ -0,0 +1,8 @@
+package io.deephaven.csv;
+
+/**
+ * This will probably be deleted in the final version. Its purpose is to allow running both the CSV-specific tests and
+ * the Deephaven tests of CSV at the same time.
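+ * (A JUnit category marker; see the {@code @Category} annotations on TestCsvTools and DeephavenCsvTest below.)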
+ */ +public interface CsvTestCategory { +} diff --git a/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java b/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java new file mode 100644 index 00000000000..8b305f3d644 --- /dev/null +++ b/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java @@ -0,0 +1,55 @@ +package io.deephaven.csv; + +import io.deephaven.engine.table.Table; +import io.deephaven.engine.util.TableTools; +import io.deephaven.time.DateTime; +import org.assertj.core.api.Assertions; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.io.*; +import java.time.LocalDateTime; +import java.time.ZoneId; + +@Category(CsvTestCategory.class) +public class DeephavenCsvTest { + + @Test + public void dateTimeCustomTimezone() { + final ZoneId nycId = ZoneId.of("America/New_York"); + final DateTime DATETIME_A = + DateTime.of(LocalDateTime.of(2019, 5, 2, 19, 33, 12, 123456789).atZone(nycId).toInstant()); + final DateTime DATETIME_B = + DateTime.of(LocalDateTime.of(2017, 2, 2, 3, 18, 55, 987654321).atZone(nycId).toInstant()); + + final String input = "" + + "Timestamp\n" + + "2019-05-02 19:33:12.123456789 NY\n" + + "\n" + + "2017-02-02T03:18:55.987654321 NY\n"; + + final Table expected = TableTools.newTable( + TableTools.col("Timestamp", DATETIME_A, null, DATETIME_B)); + + invokeTest(input, CsvSpecs.csv(), expected); + } + + private static void invokeTest(String input, CsvSpecs specs, Table expected) { + try { + final Table actual; + try (final Reader reader = new StringReader(input)) { + actual = specs.parse(reader); + } catch (Exception e) { + if (expected == null) { + // expected! + return; + } + throw e; + } + final String differences = TableTools.diff(actual, expected, 25); + Assertions.assertThat(differences).isEmpty(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/extensions/csv/src/test/java/io/deephaven/csv/InferenceTest.java b/extensions/csv/src/test/java/io/deephaven/csv/InferenceTest.java deleted file mode 100644 index 030ff9f4c13..00000000000 --- a/extensions/csv/src/test/java/io/deephaven/csv/InferenceTest.java +++ /dev/null @@ -1,163 +0,0 @@ -package io.deephaven.csv; - -import org.junit.Test; - -import java.util.Arrays; - -import static org.assertj.core.api.Assertions.assertThat; - -public class InferenceTest { - - @Test - public void singleParserNoBackupNoItems() { - noInfer(InferenceSpecs.builder().addParsers(Parser.INT).onNullParser(null).build()); - } - - @Test - public void singleParserBackupNoItems() { - infer(InferenceSpecs.builder().addParsers(Parser.INT).onNullParser(Parser.DOUBLE).build(), Parser.DOUBLE); - } - - @Test - public void singleParserNoBackupNullItems() { - noInfer(InferenceSpecs.builder().addParsers(Parser.INT).onNullParser(null).build(), null, null); - } - - @Test - public void singleParserBackupNullItems() { - infer(InferenceSpecs.builder().addParsers(Parser.INT).onNullParser(Parser.DOUBLE).build(), Parser.DOUBLE, null, - null); - } - - @Test - public void noItems() { - // all null defaults to string type - infer(Parser.STRING); - - // if we use a custom inference specs with null parser, will not infer - noInfer(InferenceSpecs.builder().addParsers(Parser.STRING).onNullParser(null).build()); - } - - @Test - public void allNull() { - // all null defaults to string type - infer(Parser.STRING, null, null, null, null); - - // if we use a custom inference specs with null parser, will not infer - 
noInfer(InferenceSpecs.builder().addParsers(Parser.STRING).onNullParser(null).build(), null, null, null, null); - } - - @Test - public void mixedType() { - infer(Parser.STRING, "1.0", "1", null, "true", "False"); - } - - @Test - public void stringType() { - infer(Parser.STRING, "this", "should", null, "be", "a", "string"); - } - - @Test - public void boolType() { - infer(Parser.BOOL, "true", null, "True", "false", "False"); - } - - @Test - public void notQuiteBool() { - infer(Parser.STRING, "true", null, "1", "false", "False"); - infer(Parser.STRING, "true", null, "yes", "false", "False"); - infer(Parser.STRING, "true", null, "", "false", "False"); - } - - @Test - public void charType() { - infer(Parser.CHAR, "a", null, "b", "c", "c"); - } - - @Test - public void notQuiteChar() { - infer(Parser.STRING, "a", null, "", "c", "c"); - infer(Parser.STRING, "a", null, "bb", "c", "c"); - } - - @Test - public void shortType() { - infer(Parser.SHORT, "1", "2", null, "-1", String.valueOf(Short.MAX_VALUE)); - } - - @Test - public void notQuiteShort() { - infer(Parser.INT, "1", "2", null, "-1", String.valueOf(Short.MAX_VALUE + 1)); - infer(Parser.STRING, "1", "2", null, "-1", ""); - } - - @Test - public void intType() { - infer(Parser.INT, "1", "2", null, "-1", String.valueOf(Integer.MAX_VALUE)); - } - - @Test - public void notQuiteInt() { - infer(Parser.LONG, "1", "2", null, "-1", String.valueOf(Integer.MAX_VALUE + 1L)); - infer(Parser.STRING, "1", "2", null, "-1", ""); - } - - @Test - public void longType() { - infer(Parser.LONG, "1", "2", null, "-1", String.valueOf(Long.MAX_VALUE)); - } - - @Test - public void notQuiteLong() { - // one more than Long.MAX_VALUE - infer(Parser.DOUBLE, "1", "2", null, "-1", "9223372036854775808"); - infer(Parser.STRING, "1", "2", null, "-1", ""); - } - - @Test - public void doubleType() { - infer(Parser.DOUBLE, "1", "1.1", null, "-1", String.valueOf(Double.MIN_VALUE)); - infer(Parser.DOUBLE, "1", "1.1", null, "-1", String.valueOf(Double.MAX_VALUE)); - infer(Parser.DOUBLE, "1", "1.1", null, "-1", String.valueOf(Double.NEGATIVE_INFINITY)); - infer(Parser.DOUBLE, "1", "1.1", null, "-1", String.valueOf(Double.POSITIVE_INFINITY)); - infer(Parser.DOUBLE, "1", "1.1", null, "-1", String.valueOf(Double.NaN)); - infer(Parser.DOUBLE, "1", "1.1", null, "-1", String.valueOf(Double.MIN_NORMAL)); - } - - @Test - public void notQuiteDouble() { - infer(Parser.STRING, "1", "1.1", null, "-1", ""); - } - - @Test - public void instantType() { - infer(Parser.INSTANT, "2019-08-25T11:34:56.000Z", null, "2021-01-01T09:00:00Z"); - } - - @Test - public void notQuiteInstant() { - infer(Parser.STRING, "2019-08-25T11:34:56.000Z", null, ""); - } - - @Test - public void shortCircuitEvenIfEventuallyIncorrect() { - final InferenceSpecs shortOrInt = InferenceSpecs.builder().addParsers(Parser.SHORT, Parser.INT).build(); - infer(shortOrInt, Parser.INT, "1", "2", Integer.toString(Short.MAX_VALUE + 1), "This is not an int"); - } - - private static void infer(Parser expected, String... values) { - infer(InferenceSpecs.standard(), expected, values); - } - - private static void noInfer(String... values) { - noInfer(InferenceSpecs.standard(), values); - } - - private static void infer(InferenceSpecs specs, Parser expected, String... values) { - assertThat(specs.infer(Arrays.asList(values).iterator())).contains(expected); - } - - private static void noInfer(InferenceSpecs specs, String... 
values) { - assertThat(specs.infer(Arrays.asList(values).iterator())).isEmpty(); - } -} diff --git a/extensions/csv/src/test/java/io/deephaven/csv/TestCsvTools.java b/extensions/csv/src/test/java/io/deephaven/csv/TestCsvTools.java index 71bd5e2daaa..78d7c42248a 100644 --- a/extensions/csv/src/test/java/io/deephaven/csv/TestCsvTools.java +++ b/extensions/csv/src/test/java/io/deephaven/csv/TestCsvTools.java @@ -1,6 +1,7 @@ package io.deephaven.csv; import io.deephaven.base.FileUtils; +import io.deephaven.csv.util.CsvReaderException; import io.deephaven.engine.table.Table; import io.deephaven.engine.table.TableDefinition; import io.deephaven.engine.table.impl.InMemoryTable; @@ -32,7 +33,7 @@ /** * Unit tests for {@link CsvTools}. */ -@Category(OutOfBandTest.class) +@Category({OutOfBandTest.class, CsvTestCategory.class}) public class TestCsvTools { private File tmpDir; @@ -48,44 +49,36 @@ public void removeDir() { } @Test - public void testTableDividendsCSV() { + public void testTableDividendsCSV() throws CsvReaderException { final String fileDividends = "Sym,Type,Price,SecurityId\n" + "GOOG, Dividend, 0.25, 200\n" + "T, Dividend, 0.15, 300\n" + " Z, Dividend, 0.18, 500"; - try { - Table tableDividends = CsvTools.readCsv(new ByteArrayInputStream(fileDividends.getBytes())); - Assert.assertEquals(3, tableDividends.size()); - Assert.assertEquals(4, tableDividends.getMeta().size()); - Assert.assertEquals(0.15, tableDividends.getColumn(2).getDouble(1), 0.000001); - Assert.assertEquals(300, tableDividends.getColumn(3).getShort(1)); - Assert.assertEquals("Z", tableDividends.getColumn(0).get(2)); - } catch (IOException e) { - throw new RuntimeException("Failed to execute readCSV test. ", e); - } + Table tableDividends = CsvTools.readCsv(new ByteArrayInputStream(fileDividends.getBytes())); + Assert.assertEquals(3, tableDividends.size()); + Assert.assertEquals(4, tableDividends.getMeta().size()); + Assert.assertEquals(0.15, tableDividends.getColumn(2).getDouble(1), 0.000001); + Assert.assertEquals(300, tableDividends.getColumn(3).getInt(1)); + Assert.assertEquals("Z", tableDividends.getColumn(0).get(2)); } @Test - public void testTableDividendsCSVNoTrim() { + public void testTableDividendsCSVNoTrim() throws CsvReaderException { final String fileDividends = "Sym,Type,Price,SecurityId\n" + "GOOG, Dividend, 0.25, 200\n" + "T, Dividend, 0.15, 300\n" + " Z, Dividend, 0.18, 500"; - try { - Table tableDividends = CsvTools - .readCsv(new ByteArrayInputStream(fileDividends.getBytes()), "DEFAULT"); - Assert.assertEquals(3, tableDividends.size()); - Assert.assertEquals(4, tableDividends.getMeta().size()); - Assert.assertEquals(" 0.15", tableDividends.getColumn(2).get(1)); - Assert.assertEquals(" 300", tableDividends.getColumn(3).get(1)); - Assert.assertEquals(" Z", tableDividends.getColumn(0).get(2)); - } catch (IOException e) { - throw new RuntimeException("Failed to execute readCSV test. 
", e); - } + Table tableDividends = CsvTools + .readCsv(new ByteArrayInputStream(fileDividends.getBytes()), "DEFAULT"); + Assert.assertEquals(3, tableDividends.size()); + Assert.assertEquals(4, tableDividends.getMeta().size()); + Assert.assertEquals(0.15, tableDividends.getColumn(2).get(1)); + Assert.assertEquals(300, tableDividends.getColumn(3).get(1)); + Assert.assertEquals(" Z", tableDividends.getColumn(0).get(2)); } @Test - public void testCompressedCSV() throws IOException { + public void testCompressedCSV() throws IOException, CsvReaderException { final String contents = "A,B,C,D\n" + "\"Hello World\",3.0,5,700\n" + "\"Goodbye Cruel World\",3.1,1000000,800\n" @@ -112,7 +105,7 @@ public void testCompressedCSV() throws IOException { } @Test - public void testUncompressedCSVFromPath() throws IOException { + public void testUncompressedCSVFromPath() throws IOException, CsvReaderException { String contents = "A,B,C,D\n" + "\"Hello World\",3.0,5,700\n" + "\"Goodbye Cruel World\",3.1,1000000,800\n" @@ -134,7 +127,7 @@ public void testUncompressedCSVFromPath() throws IOException { @Test public void testLoadCsv() throws Exception { - String allSeparators = ",|\tzZ- €9@"; + final String allSeparators = ",|\tzZ- €9@"; System.out.println("Char Set: " + Charset.defaultCharset().displayName()); for (char separator : allSeparators.toCharArray()) { @@ -168,7 +161,7 @@ public void testLoadCsv() throws Exception { Assert.assertEquals(String.class, definition.getColumnList().get(0).getDataType()); Assert.assertEquals("colB", definition.getColumnList().get(1).getName()); - Assert.assertEquals(short.class, definition.getColumnList().get(1).getDataType()); + Assert.assertEquals(int.class, definition.getColumnList().get(1).getDataType()); Assert.assertEquals("colC", definition.getColumnList().get(2).getName()); Assert.assertEquals(double.class, definition.getColumnList().get(2).getDataType()); @@ -186,19 +179,19 @@ public void testLoadCsv() throws Exception { Assert.assertEquals(Boolean.class, definition.getColumnList().get(6).getDataType()); Assert.assertEquals(String.format("mark1%smark2", separator), table.getColumn("colA").get(0)); - Assert.assertEquals(1, table.getColumn("colB").getShort(0)); + Assert.assertEquals(1, table.getColumn("colB").getInt(0)); Assert.assertEquals(1.0, table.getColumn("colC").getDouble(0), 0.000001); Assert.assertEquals("1", table.getColumn("colD").get(0)); - Assert.assertEquals(null, table.getColumn("colE").get(0)); - Assert.assertEquals(null, table.getColumn("colF").get(0)); + Assert.assertNull(table.getColumn("colE").get(0)); + Assert.assertNull(table.getColumn("colF").get(0)); Assert.assertEquals(Boolean.TRUE, table.getColumn("colG").getBoolean(0)); - Assert.assertEquals(null, table.getColumn("colA").get(2)); - Assert.assertEquals(QueryConstants.NULL_SHORT, table.getColumn("colB").getShort(2)); + Assert.assertNull(table.getColumn("colA").get(2)); + Assert.assertEquals(QueryConstants.NULL_INT, table.getColumn("colB").getInt(2)); Assert.assertEquals(QueryConstants.NULL_DOUBLE, table.getColumn("colC").getDouble(2), 0.0000001); - Assert.assertEquals(null, table.getColumn("colD").get(2)); - Assert.assertEquals(null, table.getColumn("colE").get(2)); - Assert.assertEquals(null, table.getColumn("colF").get(2)); + Assert.assertNull(table.getColumn("colD").get(2)); + Assert.assertNull(table.getColumn("colE").get(2)); + Assert.assertNull(table.getColumn("colF").get(2)); Assert.assertEquals(QueryConstants.NULL_BOOLEAN, table.getColumn("colG").getBoolean(2)); } } diff --git 
a/extensions/csv/src/test/resources/io/deephaven/csv/bools.csv b/extensions/csv/src/test/resources/io/deephaven/csv/bools.csv deleted file mode 100644 index ba641167774..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/bools.csv +++ /dev/null @@ -1,8 +0,0 @@ -Bool -true -, -false -True -False -TrUe -FALSE diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/byte.csv b/extensions/csv/src/test/resources/io/deephaven/csv/byte.csv deleted file mode 100644 index 7cbc6ff227a..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/byte.csv +++ /dev/null @@ -1,4 +0,0 @@ -Byte --127 -, -127 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/chars.csv b/extensions/csv/src/test/resources/io/deephaven/csv/chars.csv deleted file mode 100644 index c7e8f615077..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/chars.csv +++ /dev/null @@ -1,8 +0,0 @@ -Char -A -, -B -C -1 -2 -3 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/doubles.csv b/extensions/csv/src/test/resources/io/deephaven/csv/doubles.csv deleted file mode 100644 index 744ff32abd4..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/doubles.csv +++ /dev/null @@ -1,8 +0,0 @@ -Double -Infinity -, --Infinity -NaN -1.7976931348623157e+308 -2.2250738585072014E-308 -4.9e-324 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/floats.csv b/extensions/csv/src/test/resources/io/deephaven/csv/floats.csv deleted file mode 100644 index b94fa3f3c27..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/floats.csv +++ /dev/null @@ -1,8 +0,0 @@ -Float -Infinity -, --Infinity -NaN -3.4028235e+38f -1.17549435E-38f -1.4e-45f diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/int.csv b/extensions/csv/src/test/resources/io/deephaven/csv/int.csv deleted file mode 100644 index e5dc41bb7e0..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/int.csv +++ /dev/null @@ -1,4 +0,0 @@ -Int --2147483647 -, -2147483647 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/language-example-headerless.csv b/extensions/csv/src/test/resources/io/deephaven/csv/language-example-headerless.csv deleted file mode 100644 index d1cd45b8b4d..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/language-example-headerless.csv +++ /dev/null @@ -1,8 +0,0 @@ -C,Dennis Ritchie,Compiled -C++,Bjarne Stroustrup,Compiled -Fortran,John Backus,Compiled -Java,James Gosling,Both -JavaScript,Brendan Eich,Interpreted -MATLAB,Cleve Moler,Interpreted -Pascal,Niklas Wirth,Compiled -Python,Guido van Rossum,Interpreted \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/language-example.csv b/extensions/csv/src/test/resources/io/deephaven/csv/language-example.csv deleted file mode 100644 index ffeeb71d53a..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/language-example.csv +++ /dev/null @@ -1,9 +0,0 @@ -Language,Creator,Type -C,Dennis Ritchie,Compiled -C++,Bjarne Stroustrup,Compiled -Fortran,John Backus,Compiled -Java,James Gosling,Both -JavaScript,Brendan Eich,Interpreted -MATLAB,Cleve Moler,Interpreted -Pascal,Niklas Wirth,Compiled -Python,Guido van Rossum,Interpreted \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/language-example.tsv b/extensions/csv/src/test/resources/io/deephaven/csv/language-example.tsv deleted file mode 100644 index 4219cecbec4..00000000000 --- 
a/extensions/csv/src/test/resources/io/deephaven/csv/language-example.tsv +++ /dev/null @@ -1,9 +0,0 @@ -Language Creator Type -C Dennis Ritchie Compiled -C++ Bjarne Stroustrup Compiled -Fortran John Backus Compiled -Java James Gosling Both -JavaScript Brendan Eich Interpreted -MATLAB Cleve Moler Interpreted -Pascal Niklas Wirth Compiled -Python Guido van Rossum Interpreted \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/long.csv b/extensions/csv/src/test/resources/io/deephaven/csv/long.csv deleted file mode 100644 index ff0a0cb279b..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/long.csv +++ /dev/null @@ -1,4 +0,0 @@ -Long --9223372036854775807 -, -9223372036854775807 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/short.csv b/extensions/csv/src/test/resources/io/deephaven/csv/short.csv deleted file mode 100644 index 83c29f32203..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/short.csv +++ /dev/null @@ -1,4 +0,0 @@ -Short --32767 -, -32767 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/strings-pound.csv b/extensions/csv/src/test/resources/io/deephaven/csv/strings-pound.csv deleted file mode 100644 index b413d43b0b7..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/strings-pound.csv +++ /dev/null @@ -1,4 +0,0 @@ -String -#Hello, world# -, -Goodbye. \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/strings.csv b/extensions/csv/src/test/resources/io/deephaven/csv/strings.csv deleted file mode 100644 index 61f95587a10..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/strings.csv +++ /dev/null @@ -1,4 +0,0 @@ -String -"Hello, world" -, -Goodbye. \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-legacy.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-legacy.csv deleted file mode 100644 index 182814c1d68..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-legacy.csv +++ /dev/null @@ -1,4 +0,0 @@ -Timestamp -2021-09-27T19:00:00 UTC -, -2021-09-27T20:00:00 UTC diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-micros.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-micros.csv deleted file mode 100644 index bc73d5759f5..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-micros.csv +++ /dev/null @@ -1,4 +0,0 @@ -Timestamp -1632769200000000 -, -1632772800000000 \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-millis.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-millis.csv deleted file mode 100644 index 44d4fdd5ea4..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-millis.csv +++ /dev/null @@ -1,4 +0,0 @@ -Timestamp -1632769200000 -, -1632772800000 \ No newline at end of file diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-mixed.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-mixed.csv deleted file mode 100644 index 4ba0ce1ad94..00000000000 --- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-mixed.csv +++ /dev/null @@ -1,4 +0,0 @@ -Timestamp -1632769200000 -, -1632772800000000 diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-nanos.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-nanos.csv deleted file mode 100644 index 
--- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-nanos.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Timestamp
-1632769200000000000
-,
-1632772800000000000
\ No newline at end of file
diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-seconds.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-seconds.csv
deleted file mode 100644
index bdb1935814f..00000000000
--- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp-seconds.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Timestamp
-1632769200
-,
-1632772800
\ No newline at end of file
diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp.csv b/extensions/csv/src/test/resources/io/deephaven/csv/timestamp.csv
deleted file mode 100644
index 406bd36c6dc..00000000000
--- a/extensions/csv/src/test/resources/io/deephaven/csv/timestamp.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Timestamp
-2021-09-27T19:00:00Z
-,
-2021-09-27T20:00:00Z
diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-inside-and-outside.csv b/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-inside-and-outside.csv
deleted file mode 100644
index 7a029354d59..00000000000
--- a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-inside-and-outside.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Sym,Type,Price,SecurityId
-"GOOG", " Dividend", " 0.25", " 200"
-"T", " Dividend", " 0.15", " 300"
-" Z", " Dividend", " 0.18", " 500"
\ No newline at end of file
diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-inside.csv b/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-inside.csv
deleted file mode 100644
index 4745ea745b1..00000000000
--- a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-inside.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Sym,Type,Price,SecurityId
-"GOOG"," Dividend"," 0.25"," 200"
-"T"," Dividend"," 0.15"," 300"
-" Z"," Dividend"," 0.18"," 500"
\ No newline at end of file
diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-no-quotes.csv b/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-no-quotes.csv
deleted file mode 100644
index 6e3c34c7fd3..00000000000
--- a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-no-quotes.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Sym,Type,Price,SecurityId
-GOOG, Dividend, 0.25, 200
-T, Dividend, 0.15, 300
- Z, Dividend, 0.18, 500
\ No newline at end of file
diff --git a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-outside.csv b/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-outside.csv
deleted file mode 100644
index 8bff3eac84e..00000000000
--- a/extensions/csv/src/test/resources/io/deephaven/csv/whitespace-outside.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Sym,Type,Price,SecurityId
-"GOOG", "Dividend", "0.25", "200"
-"T", "Dividend", "0.15", "300"
- "Z", "Dividend", "0.18", "500"
\ No newline at end of file
diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/KeyValuePartitionLayout.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/KeyValuePartitionLayout.java
index d3c2119c555..c558668d91d 100644
--- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/KeyValuePartitionLayout.java
+++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/KeyValuePartitionLayout.java
@@ -1,6 +1,7 @@
 package io.deephaven.parquet.table.layout;
 
 import io.deephaven.base.verify.Require;
+import io.deephaven.csv.util.CsvReaderException;
 import io.deephaven.engine.table.Table;
 import io.deephaven.api.util.NameValidator;
 import io.deephaven.csv.CsvTools;
@@ -142,7 +143,7 @@ public FileVisitResult postVisitDirectory(@NotNull final Path dir,
             try {
                 partitioningColumnTable = csvBuilder.length() == 0
                         ? TableTools.emptyTable(targetFiles.size())
                         : CsvTools.readCsv(new ByteArrayInputStream(csvBuilder.toString().getBytes()));
-            } catch (IOException e) {
+            } catch (CsvReaderException e) {
                 throw new TableDataException("Failed converting partition CSV to table for " + tableRootDirectory, e);
             }
diff --git a/grpc-api/src/main/java/io/deephaven/grpc_api/uri/CsvTableResolver.java b/grpc-api/src/main/java/io/deephaven/grpc_api/uri/CsvTableResolver.java
index f79b6e0f8a7..9ba4cc34137 100644
--- a/grpc-api/src/main/java/io/deephaven/grpc_api/uri/CsvTableResolver.java
+++ b/grpc-api/src/main/java/io/deephaven/grpc_api/uri/CsvTableResolver.java
@@ -1,6 +1,7 @@
 package io.deephaven.grpc_api.uri;
 
 import io.deephaven.csv.CsvTools;
+import io.deephaven.csv.util.CsvReaderException;
 import io.deephaven.engine.table.Table;
 import io.deephaven.uri.resolver.UriResolver;
 import io.deephaven.uri.resolver.UriResolversInstance;
@@ -57,7 +58,11 @@ public Table resolve(URI uri) {
     }
 
     public Table read(URI uri) throws IOException {
-        return CsvTools.readCsv(csvString(uri));
+        try {
+            return CsvTools.readCsv(csvString(uri));
+        } catch (CsvReaderException inner) {
+            throw new RuntimeException("Failed to read CSV table from " + uri, inner);
+        }
     }
 
     private static String csvString(URI uri) {
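Reviewer note: the two hunks above follow the same migration pattern as the earlier files in this diff. CsvTools.readCsv no longer declares IOException; it throws the checked io.deephaven.csv.util.CsvReaderException, and call sites translate that into an unchecked exception appropriate to their layer (TableDataException in KeyValuePartitionLayout, RuntimeException in CsvTableResolver). A minimal sketch of the translation idiom follows; the helper name and the IllegalStateException choice are illustrative, not part of this change:

    import io.deephaven.csv.CsvTools;
    import io.deephaven.csv.util.CsvReaderException;
    import io.deephaven.engine.table.Table;

    public final class CsvReadIdiom {
        // Hypothetical helper: translate the checked parser exception into an
        // unchecked one so callers that cannot recover need not declare it.
        static Table readTableOrThrow(String path) {
            try {
                // readCsv now declares CsvReaderException rather than IOException.
                return CsvTools.readCsv(path);
            } catch (CsvReaderException e) {
                throw new IllegalStateException("Failed to parse CSV at " + path, e);
            }
        }
    }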
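Usage sketch for the CsvTableResolver hunk: read(URI) keeps its throws IOException signature because fetching the URI contents can still fail with I/O errors, so parse failures now surface as an unchecked wrapper whose cause is the CsvReaderException. The resolver construction and URI scheme below are assumptions for illustration only:

    // Illustrative caller; resolver wiring and the URI are assumed, not from this diff.
    final CsvTableResolver resolver = new CsvTableResolver();
    try {
        final Table t = resolver.read(URI.create("csv+https://example.com/data.csv"));
    } catch (IOException e) {
        // Checked: fetching the URI contents failed.
    } catch (RuntimeException e) {
        // Unchecked: the CSV could not be parsed; e.getCause() is the CsvReaderException.
    }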