Run spotless, fix some stuff
kosak committed Dec 16, 2021
1 parent 75fae21 commit a4b6ca5
Showing 36 changed files with 440 additions and 466 deletions.
@@ -41,7 +41,7 @@ public static InMemoryTable from(NewTable table) {

// TODO(kosak): this may not be what we want.
public static InMemoryTable from(TableDefinition definition, TrackingRowSet rowSet,
Map<String, ? extends ColumnSource<?>> columns) {
Map<String, ? extends ColumnSource<?>> columns) {
return new InMemoryTable(definition, rowSet, columns);
}

11 changes: 6 additions & 5 deletions extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -144,8 +144,7 @@ public static CsvSpecs fromLegacyFormat(String format) {
* A header, when specified, hints at the parser to use.
*
* <p>
* To be even more explicit, callers may also use {@link #parserForName()} or
* {@link #parserForIndex()}.
* To be even more explicit, callers may also use {@link #parserForName()} or {@link #parserForIndex()}.
*
* @return the table header
*/
@@ -159,7 +158,8 @@ public static CsvSpecs fromLegacyFormat(String format) {
public abstract Map<String, ParserBase<?>> parserForName();

/**
* The parsers, where the keys are 1-based column indices. Specifying a parser for a column forgoes inference for that column.
* The parsers, where the keys are 1-based column indices. Specifying a parser for a column forgoes inference for
* that column.
*
* @return the parsers
*/
@@ -264,6 +264,7 @@ public Charset charset() {

/**
* Should the CSVReader run its processing steps on multiple threads for better performance.
*
* @return the async flag
*/
@Default
@@ -367,7 +368,7 @@ private static abstract class MySinkBase<TYPE, TARRAY> implements Sink<TARRAY> {
private long resultSize = 0;

public MySinkBase(ArrayBackedColumnSource<TYPE> result, Class<?> interpClass,
ChunkWrapInvoker<TARRAY> chunkWrapInvoker) {
ChunkWrapInvoker<TARRAY> chunkWrapInvoker) {
this.result = result;
if (interpClass != null) {
reinterpreted = (WritableColumnSource<?>) result.reinterpret(interpClass);
@@ -386,7 +387,7 @@ public final void write(TARRAY src, int srcOffset, long destOffset, int size) {
reinterpreted.ensureCapacity(requiredCapacity);
resultSize = Math.max(resultSize, requiredCapacity);
try (final ChunkSink.FillFromContext context = reinterpreted.makeFillFromContext(size);
final RowSequence range = RowSequenceFactory.forRange(destOffset, destOffset + size - 1)) {
final RowSequence range = RowSequenceFactory.forRange(destOffset, destOffset + size - 1)) {
Chunk<? extends Values> chunk = chunkWrapInvoker.apply(src, srcOffset, size);
reinterpreted.fillFromChunk(context, chunk, range);
}
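
The parserForName()/parserForIndex() javadoc earlier in this file's diff says that pinning a parser to a column forgoes inference for that column. A hypothetical usage sketch follows; the builder() entry point and the putParserForName/putParserForIndex method names are assumptions (CsvSpecs looks Immutables-style, but those names are not shown in this diff), and only Parsers.DATETIME and Parsers.STRING are taken from the code above.

// Sketch only; not part of this commit. Builder method names are assumed.
CsvSpecs specs = CsvSpecs.builder()
        .putParserForName("Timestamp", Parsers.DATETIME) // always parse this column as a date-time
        .putParserForIndex(3, Parsers.STRING)            // keys for parserForIndex are 1-based column indices
        .build();                                        // every other column still goes through inference
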
74 changes: 37 additions & 37 deletions extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java
@@ -46,11 +46,11 @@ public static InferenceSpecs strings() {
*/
public static InferenceSpecs minimal() {
return builder().addParsers(
Parsers.DATETIME,
Parsers.LONG,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.STRING)
Parsers.DATETIME,
Parsers.LONG,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.STRING)
.build();
}

@@ -59,14 +59,14 @@ public static InferenceSpecs minimal() {
*/
public static InferenceSpecs standard() {
return builder().addParsers(
Parsers.DATETIME,
Parsers.SHORT,
Parsers.INT,
Parsers.LONG,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
Parsers.DATETIME,
Parsers.SHORT,
Parsers.INT,
Parsers.LONG,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
.build();
}

@@ -77,45 +77,45 @@ public static InferenceSpecs standard() {
*/
public static InferenceSpecs standardTimes() {
return builder().addParsers(
Parsers.TIMESTAMP,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
Parsers.TIMESTAMP,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
.build();
}

public static InferenceSpecs milliTimes() {
return builder().addParsers(
Parsers.TIMESTAMP_MILLIS,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
Parsers.TIMESTAMP_MILLIS,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
.build();
}

public static InferenceSpecs microTimes() {
return builder().addParsers(
Parsers.TIMESTAMP_MICROS,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
Parsers.TIMESTAMP_MICROS,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
.build();
}

public static InferenceSpecs nanoTimes() {
return builder().addParsers(
Parsers.TIMESTAMP_NANOS,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
Parsers.TIMESTAMP_NANOS,
Parsers.FLOAT,
Parsers.DOUBLE,
Parsers.BOOLEAN,
Parsers.CHAR,
Parsers.STRING)
.build();
}

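
The factory methods above differ only in the list handed to addParsers, so a caller who wants a narrower set of candidate types can presumably assemble one the same way. A sketch using only the builder calls visible in this diff (it assumes builder() is publicly accessible):

// Sketch only; not part of this commit. Try LONG, then DOUBLE, then fall back to STRING.
InferenceSpecs numericOrString = InferenceSpecs.builder()
        .addParsers(
                Parsers.LONG,
                Parsers.DOUBLE,
                Parsers.STRING)
        .build();
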
@@ -5,8 +5,7 @@ public final class ByteSlice {
private int begin;
private int end;

public ByteSlice() {
}
public ByteSlice() {}

public ByteSlice(byte[] data, int begin, int end) {
reset(data, begin, end);
@@ -34,9 +33,18 @@ public byte back() {
return data[end - 1];
}

public byte[] data() { return data; }
public int begin() { return begin; }
public int end() { return end; }
public byte[] data() {
return data;
}

public int begin() {
return begin;
}

public int end() {
return end;
}

public int size() {
return end - begin;
}
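
For orientation, the accessors reformatted above are plain views over a caller-owned array; a small usage sketch built only from the constructor and methods shown in this diff:

// Sketch only; not part of this commit.
byte[] bytes = "12345.6789".getBytes(java.nio.charset.StandardCharsets.US_ASCII);
ByteSlice slice = new ByteSlice(bytes, 0, bytes.length); // view over [begin, end) of the backing array
int length = slice.size();                               // end - begin
byte lastDigit = slice.back();                           // data[end - 1], here '9'
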
@@ -27,7 +27,7 @@ public char back() {

public void copyTo(byte[] dest, int destOffset) {
for (int cur = begin; cur != end; ++cur) {
dest[destOffset++] = (byte)data[cur];
dest[destOffset++] = (byte) data[cur];
}
}

@@ -41,12 +41,21 @@ public void setEnd(int end) {
this.end = end;
}

public char[] data() { return data; }
public char[] data() {
return data;
}

public int begin() {
return begin;
}
public int end() { return end; }
public int size() { return end - begin; }

public int end() {
return end;
}

public int size() {
return end - begin;
}

@Override
public String toString() {
@@ -35,10 +35,10 @@ public final class DenseStorageReader {
private final int[] intHolder = new int[1];

public DenseStorageReader(final QueueReader.IntReader controlReader,
final QueueReader.ByteReader byteReader,
final QueueReader.CharReader charReader,
final QueueReader.ByteArrayReader largeByteArrayReader,
final QueueReader.CharArrayReader largeCharArrayReader) {
final QueueReader.ByteReader byteReader,
final QueueReader.CharReader charReader,
final QueueReader.ByteArrayReader largeByteArrayReader,
final QueueReader.CharArrayReader largeCharArrayReader) {
this.controlReader = controlReader;
this.byteReader = byteReader;
this.charReader = charReader;
@@ -3,81 +3,81 @@
import io.deephaven.csv.containers.CharSlice;

/**
* The DenseStorageWriter and {@link DenseStorageReader} work in tandem, forming a FIFO queue. The DenseStorageWriter writes
* data, and the {@link DenseStorageReader} reads that data. If the {@link DenseStorageReader} "catches up", it will block until
* the DenseStorageWriter provides more data, or indicates that it is done (via the {@link #finish()} method.
* This synchronization is done at "block" granularity, so the DenseStorageReader can only proceed when the
* The DenseStorageWriter and {@link DenseStorageReader} work in tandem, forming a FIFO queue. The DenseStorageWriter
* writes data, and the {@link DenseStorageReader} reads that data. If the {@link DenseStorageReader} "catches up", it
* will block until the DenseStorageWriter provides more data, or indicates that it is done (via the {@link #finish()}
* method. This synchronization is done at "block" granularity, so the DenseStorageReader can only proceed when the
* DenseStorageWriter has written at least a "block" of data or is done. We allow multiple independent
* {@link DenseStorageReader}s to consume the same underlying data. In our implementation this is used so our
* type inferencer can take a second "pass" over the same input data.
* {@link DenseStorageReader}s to consume the same underlying data. In our implementation this is used so our type
* inferencer can take a second "pass" over the same input data.
*
* The point of this object is to store a sequence of (character sequences aka "strings", but not java.lang.String),
* using a small fraction of overhead. The problem with storing every character sequence as a java.lang.String is:
* <ol>
* <li>Per-object overhead (8 or 16 bytes)</li>
* <li>The memory cost of holding a reference to that String (4 or 8 bytes)</li>
* <li>The string has to know its length (4 bytes)</li>
* <li>Java characters are 2 bytes even though in practice many strings are ASCII-only and their chars can fit in a byte</li>
* <li>Java characters are 2 bytes even though in practice many strings are ASCII-only and their chars can fit in a
* byte</li>
* </ol>
*
* For small strings (say the word "hello" or the input text "12345.6789" ) the overhead can be 100% or worse.
*
* For our purposes we:
* <ol>
* <li>Only need sequential access. i.e. we don't need random access into the sequence of "strings". So we can support
* a model where we can have a forward-only cursor moving over the sequence of "strings".</li>
* <li>Don't need to give our caller a data structure that they can hold on to. The caller only gets a "view"
* (a slice) of the current "string" data. The view is invalidated when they move to the next "string"</li>
* <li>Only need sequential access. i.e. we don't need random access into the sequence of "strings". So we can support a
* model where we can have a forward-only cursor moving over the sequence of "strings".</li>
* <li>Don't need to give our caller a data structure that they can hold on to. The caller only gets a "view" (a slice)
* of the current "string" data. The view is invalidated when they move to the next "string"</li>
* </ol>
*
* Furthermore we:
* <ol>
* <li>Offer a FIFO model where the reader (in a separate thread) can chase the writer but there is not an
* inordinate amount of synchronization overhead.</li>
* <li>Have the ability to make multiple Readers which pass over the same underlying data. This is our
* low-drama way of allowing our client to make multiple passes over the data, without complicating the iteration
* interface, with, e.g., a reset method.</li>
* <li>Use a linked-list structure so that when all existing readers have move passed a block of data, that
* block can be freed by the garbage collector.</li>
* <li>Offer a FIFO model where the reader (in a separate thread) can chase the writer but there is not an inordinate
* amount of synchronization overhead.</li>
* <li>Have the ability to make multiple Readers which pass over the same underlying data. This is our low-drama way of
* allowing our client to make multiple passes over the data, without complicating the iteration interface, with, e.g.,
* a reset method.</li>
* <li>Use a linked-list structure so that when all existing readers have move passed a block of data, that block can be
* freed by the garbage collector.</li>
* </ol>
*
* If you are familiar with the structure of our inference, you may initially think that this reader-chasing-writer
* garbage collection trick doesn't buy us much because we have a two-phase parser. However, when the inferencer has
* gotten to the last parser in its set of allowable parsers (say, the String parser), or the user has specified
* that there is only one parser for this column, then the code doesn't need to do any inference and can parse the
* column in one pass. (TODO(kosak): one-pass not implemented yet. Coming shortly).
* gotten to the last parser in its set of allowable parsers (say, the String parser), or the user has specified that
* there is only one parser for this column, then the code doesn't need to do any inference and can parse the column in
* one pass. (TODO(kosak): one-pass not implemented yet. Coming shortly).
*
* The implementation used here is to look at the "string" being added to the writer and categorize it along two
* dimensions:
* <ul>
* <li>Small vs large</li>
* <li>Byte vs char</li>
* <li>Small vs large</li>
* <li>Byte vs char</li>
* </ul>
*
* These dimensions are broken out in the following way:
* <li>Small byte "strings" are packed into a byte block, and we maintain a linked list of these byte blocks</li>
* <li>Small char "strings" are packed into a char block, and we maintain a linked list of these char blocks</li>
* <li>"Large" objects (byte or char sequences with length >= a threshold) are stored directly, meaning a byte[] or char[]
* array is allocated for their data, then a reference to that array is added to a byte-array or char-array block.
* (And again, we maintain a linked list of these byte-array or char-array blocks).
* It is not typical for CSV data to contain a cell this large, but the feature is there for completeness.
* We do not want want large "strings" to contaminate our byte and char blocks because they would not likely pack
* into them tightly. It's OK to keep them on their own because by definition, large "strings" are not going to have
* much overhead, as a percentage of their size.
* <li>"Large" objects (byte or char sequences with length >= a threshold) are stored directly, meaning a byte[] or
* char[] array is allocated for their data, then a reference to that array is added to a byte-array or char-array
* block. (And again, we maintain a linked list of these byte-array or char-array blocks). It is not typical for CSV
* data to contain a cell this large, but the feature is there for completeness. We do not want want large "strings" to
* contaminate our byte and char blocks because they would not likely pack into them tightly. It's OK to keep them on
* their own because by definition, large "strings" are not going to have much overhead, as a percentage of their size.
* </li>
* </ul>
*/
public final class DenseStorageWriter {
/**
* The ints in this array indicate where the next item is stored:
* <ul>
* <li>Integer.MIN_VALUE: largeStringWriter.</li>
* <li>Integer.MAX_VALUE: largeByteWriter.</li>
* <li>== 0: no bytes or characters, so they're not stored anywhere. Will be interpreted as a ByteSlice with
* arbitrary byte data and length 0.</li>
* <li>&lt; 0:charWriter (the number of chars is the negative of this value)</li>
* <li>&gt; 0:byteWriter (the number of chars is equal to this value)</li>
* <li></li>
* <li>Integer.MIN_VALUE: largeStringWriter.</li>
* <li>Integer.MAX_VALUE: largeByteWriter.</li>
* <li>== 0: no bytes or characters, so they're not stored anywhere. Will be interpreted as a ByteSlice with
* arbitrary byte data and length 0.</li>
* <li>&lt; 0:charWriter (the number of chars is the negative of this value)</li>
* <li>&gt; 0:byteWriter (the number of chars is equal to this value)</li>
* <li></li>
* </ul>
*/
private final QueueWriter.IntWriter controlWriter;
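
To make the control-word encoding documented above concrete, here is an illustrative stand-alone decoder. It is not code from this commit; it only restates the javadoc's cases, and it assumes that "largeStringWriter" refers to the large char-array queue and that a positive value counts bytes (the javadoc says "chars" there).

// Sketch only; not part of this commit.
final class ControlWordExample {
    /** Describes where the next cell's payload lives, per the DenseStorageWriter javadoc above. */
    static String describe(int control) {
        if (control == Integer.MIN_VALUE) {
            return "large char payload: take the next array from the large char-array queue";
        }
        if (control == Integer.MAX_VALUE) {
            return "large byte payload: take the next array from the large byte-array queue";
        }
        if (control == 0) {
            return "empty cell: interpreted as a ByteSlice of length 0";
        }
        if (control < 0) {
            return "small char payload: the next " + (-control) + " chars come from the char queue";
        }
        return "small byte payload: the next " + control + " bytes come from the byte queue";
    }
}
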
@@ -1,9 +1,9 @@
package io.deephaven.csv.densestorage;

/**
* Linked list node that holds data for a {@link DenseStorageWriter} or {@link DenseStorageReader}.
* All fields are immutable except the "next" field. Synchronization for reading/writing the "next" field
* is managed by the {@link DenseStorageWriter} and {@link DenseStorageReader}.
* Linked list node that holds data for a {@link DenseStorageWriter} or {@link DenseStorageReader}. All fields are
* immutable except the "next" field. Synchronization for reading/writing the "next" field is managed by the
* {@link DenseStorageWriter} and {@link DenseStorageReader}.
*/
public final class QueueNode<TARRAY> {
public final TARRAY data;
Diffs for the remaining changed files are not shown here.
