Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ignoreZeroDecimal to ReadOptions #748

Merged
merged 24 commits into from
Apr 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions core/src/main/java/tech/tablesaw/columns/numbers/IntParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,23 @@
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.columns.AbstractColumnParser;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.util.StringUtils;

public class IntParser extends AbstractColumnParser<Integer> {

private final boolean ignoreZeroDecimal;

/**
 * Creates a parser for the given column type that uses the default zero-decimal handling, {@link
 * ReadOptions#DEFAULT_IGNORE_ZERO_DECIMAL}.
 *
 * @param columnType the column type handed to the superclass constructor
 */
public IntParser(ColumnType columnType) {
super(columnType);
ignoreZeroDecimal = ReadOptions.DEFAULT_IGNORE_ZERO_DECIMAL;
}

/**
 * Creates a parser configured from the given read options.
 *
 * <p>If the options define a missing value indicator, {@code missingValueStrings} is set to a new
 * list built from that indicator. Zero-decimal handling follows {@link
 * ReadOptions#ignoreZeroDecimal()}.
 *
 * @param columnType the column type handed to the superclass constructor
 * @param readOptions the options supplying the missing value indicator and the ignoreZeroDecimal
 *     flag
 */
public IntParser(IntColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
}
ignoreZeroDecimal = readOptions.ignoreZeroDecimal();
}

@Override
Expand All @@ -25,8 +30,8 @@ public boolean canParse(String str) {
}
String s = str;
try {
if (s.endsWith(".0")) {
s = s.substring(0, s.length() - 2);
if (ignoreZeroDecimal) {
s = StringUtils.removeZeroDecimal(s);
}
Integer.parseInt(AbstractColumnParser.remove(s, ','));
return true;
Expand All @@ -52,8 +57,8 @@ public int parseInt(String str) {
return IntColumnType.missingValueIndicator();
}
String s = str;
if (s.endsWith(".0")) {
s = s.substring(0, s.length() - 2);
if (ignoreZeroDecimal) {
s = StringUtils.removeZeroDecimal(s);
}
return Integer.parseInt(AbstractColumnParser.remove(s, ','));
}
Expand Down
13 changes: 9 additions & 4 deletions core/src/main/java/tech/tablesaw/columns/numbers/LongParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,23 @@
import tech.tablesaw.api.ColumnType;
import tech.tablesaw.columns.AbstractColumnParser;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.util.StringUtils;

public class LongParser extends AbstractColumnParser<Long> {

private final boolean ignoreZeroDecimal;

/**
 * Creates a parser for the given column type that uses the default zero-decimal handling, {@link
 * ReadOptions#DEFAULT_IGNORE_ZERO_DECIMAL}.
 *
 * @param columnType the column type handed to the superclass constructor
 */
public LongParser(ColumnType columnType) {
super(columnType);
ignoreZeroDecimal = ReadOptions.DEFAULT_IGNORE_ZERO_DECIMAL;
}

/**
 * Creates a parser configured from the given read options.
 *
 * <p>If the options define a missing value indicator, {@code missingValueStrings} is set to a new
 * list built from that indicator. Zero-decimal handling follows {@link
 * ReadOptions#ignoreZeroDecimal()}.
 *
 * @param columnType the column type handed to the superclass constructor
 * @param readOptions the options supplying the missing value indicator and the ignoreZeroDecimal
 *     flag
 */
public LongParser(LongColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
}
ignoreZeroDecimal = readOptions.ignoreZeroDecimal();
}

@Override
Expand All @@ -25,8 +30,8 @@ public boolean canParse(String str) {
}
String s = str;
try {
if (s.endsWith(".0")) {
s = s.substring(0, s.length() - 2);
if (ignoreZeroDecimal) {
s = StringUtils.removeZeroDecimal(s);
}
Long.parseLong(AbstractColumnParser.remove(s, ','));
return true;
Expand All @@ -52,8 +57,8 @@ public long parseLong(String str) {
return LongColumnType.missingValueIndicator();
}
String s = str;
if (s.endsWith(".0")) {
s = s.substring(0, s.length() - 2);
if (ignoreZeroDecimal) {
s = StringUtils.removeZeroDecimal(s);
}
return Long.parseLong(AbstractColumnParser.remove(s, ','));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,23 @@
import com.google.common.collect.Lists;
import tech.tablesaw.columns.AbstractColumnParser;
import tech.tablesaw.io.ReadOptions;
import tech.tablesaw.util.StringUtils;

public class ShortParser extends AbstractColumnParser<Short> {

private final boolean ignoreZeroDecimal;

/**
 * Creates a parser for the given column type that uses the default zero-decimal handling, {@link
 * ReadOptions#DEFAULT_IGNORE_ZERO_DECIMAL}.
 *
 * @param columnType the column type handed to the superclass constructor
 */
public ShortParser(ShortColumnType columnType) {
super(columnType);
ignoreZeroDecimal = ReadOptions.DEFAULT_IGNORE_ZERO_DECIMAL;
}

/**
 * Creates a parser configured from the given read options.
 *
 * <p>If the options define a missing value indicator, {@code missingValueStrings} is set to a new
 * list built from that indicator. Zero-decimal handling follows {@link
 * ReadOptions#ignoreZeroDecimal()}.
 *
 * @param columnType the column type handed to the superclass constructor
 * @param readOptions the options supplying the missing value indicator and the ignoreZeroDecimal
 *     flag
 */
public ShortParser(ShortColumnType columnType, ReadOptions readOptions) {
super(columnType);
if (readOptions.missingValueIndicator() != null) {
missingValueStrings = Lists.newArrayList(readOptions.missingValueIndicator());
}
ignoreZeroDecimal = readOptions.ignoreZeroDecimal();
}

@Override
Expand All @@ -24,8 +29,8 @@ public boolean canParse(String str) {
}
String s = str;
try {
if (s.endsWith(".0")) {
s = s.substring(0, s.length() - 2);
if (ignoreZeroDecimal) {
s = StringUtils.removeZeroDecimal(s);
}
Short.parseShort(AbstractColumnParser.remove(s, ','));
return true;
Expand All @@ -51,8 +56,8 @@ public short parseShort(String str) {
return ShortColumnType.missingValueIndicator();
}
String s = str;
if (s.endsWith(".0")) {
s = s.substring(0, s.length() - 2);
if (ignoreZeroDecimal) {
s = StringUtils.removeZeroDecimal(s);
}
return Short.parseShort(AbstractColumnParser.remove(s, ','));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ default DoubleColumn distance(Column<String> column2) {
* @param columns the column to append
* @return the new column
*/
default StringColumn join(String separator, Column... columns) {
default StringColumn join(String separator, Column<?>... columns) {
StringColumn newColumn = StringColumn.create(name() + "[column appended]", this.size());
for (int r = 0; r < size(); r++) {
StringBuilder result = new StringBuilder(getString(r));
Expand Down Expand Up @@ -347,11 +347,11 @@ default StringColumn concatenate(Object... stringsToAppend) {
* @param stringColumns the string columns to append
* @return the new column
*/
default StringColumn concatenate(Column... stringColumns) {
default StringColumn concatenate(Column<?>... stringColumns) {
StringColumn newColumn = StringColumn.create(name() + "[append]", this.size());
for (int r = 0; r < size(); r++) {
StringBuilder s = new StringBuilder(getString(r));
for (Column stringColumn : stringColumns) s.append(stringColumn.getString(r));
for (Column<?> stringColumn : stringColumns) s.append(stringColumn.getString(r));
newColumn.set(r, s.toString());
}
return newColumn;
Expand Down
15 changes: 15 additions & 0 deletions core/src/main/java/tech/tablesaw/io/ReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@

public class ReadOptions {

public static final boolean DEFAULT_IGNORE_ZERO_DECIMAL = true;

private static final List<ColumnType> DEFAULT_TYPES =
Lists.newArrayList(
LOCAL_DATE_TIME, LOCAL_TIME, LOCAL_DATE, BOOLEAN, INTEGER, LONG, DOUBLE, STRING);
Expand Down Expand Up @@ -75,6 +77,7 @@ public class ReadOptions {
protected final String missingValueIndicator;
protected final boolean minimizeColumnSizes;
protected final int maxCharsPerColumn;
protected final boolean ignoreZeroDecimal;

protected final DateTimeFormatter dateFormatter;
protected final DateTimeFormatter dateTimeFormatter;
Expand All @@ -94,6 +97,7 @@ protected ReadOptions(ReadOptions.Builder builder) {
minimizeColumnSizes = builder.minimizeColumnSizes;
header = builder.header;
maxCharsPerColumn = builder.maxCharsPerColumn;
ignoreZeroDecimal = builder.ignoreZeroDecimal;

dateFormatter = builder.dateFormatter;
timeFormatter = builder.timeFormatter;
Expand Down Expand Up @@ -138,6 +142,10 @@ public boolean header() {
return header;
}

/**
 * Returns whether zero-valued decimals in data values are to be ignored, as configured on the
 * builder these options were created from.
 */
public boolean ignoreZeroDecimal() {
return ignoreZeroDecimal;
}

public DateTimeFormatter dateTimeFormatter() {
if (dateTimeFormatter != null) {
return dateTimeFormatter;
Expand Down Expand Up @@ -186,6 +194,7 @@ protected static class Builder {
protected boolean minimizeColumnSizes = false;
protected boolean header = true;
protected int maxCharsPerColumn = 4096;
protected boolean ignoreZeroDecimal = DEFAULT_IGNORE_ZERO_DECIMAL;

protected Builder() {
source = null;
Expand Down Expand Up @@ -269,6 +278,12 @@ public Builder maxCharsPerColumn(int maxCharsPerColumn) {
return this;
}

/**
 * Sets whether zero-valued decimals in data values are ignored. Defaults to {@code true} ({@link
 * ReadOptions#DEFAULT_IGNORE_ZERO_DECIMAL}).
 *
 * @param ignoreZeroDecimal {@code true} to ignore zero-valued decimals, {@code false} to retain
 *     them
 * @return this builder, for call chaining
 */
public Builder ignoreZeroDecimal(boolean ignoreZeroDecimal) {
this.ignoreZeroDecimal = ignoreZeroDecimal;
return this;
}

public Builder sample(boolean sample) {
this.sample = sample;
return this;
Expand Down
6 changes: 6 additions & 0 deletions core/src/main/java/tech/tablesaw/io/csv/CsvReadOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -302,5 +302,11 @@ public Builder minimizeColumnSizes() {
super.minimizeColumnSizes();
return this;
}

/**
 * Sets the ignoreZeroDecimal flag by delegating to the superclass builder and returns this
 * builder. NOTE(review): presumably overridden so chained calls keep this builder's own type -
 * confirm against the enclosing class.
 */
@Override
public Builder ignoreZeroDecimal(boolean ignoreZeroDecimal) {
super.ignoreZeroDecimal(ignoreZeroDecimal);
return this;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -294,5 +294,11 @@ public Builder minimizeColumnSizes() {
super.minimizeColumnSizes();
return this;
}

/**
 * Sets the ignoreZeroDecimal flag by delegating to the superclass builder and returns this
 * builder. NOTE(review): presumably overridden so chained calls keep this builder's own type -
 * confirm against the enclosing class.
 */
@Override
public Builder ignoreZeroDecimal(boolean ignoreZeroDecimal) {
super.ignoreZeroDecimal(ignoreZeroDecimal);
return this;
}
}
}
19 changes: 19 additions & 0 deletions core/src/main/java/tech/tablesaw/util/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/**
* Operations on {@link java.lang.String} that are {@code null} safe.
Expand Down Expand Up @@ -66,6 +67,8 @@ public class StringUtils {
/** The maximum size to which the padding constant(s) can expand. */
private static final int PAD_LIMIT = 8192;

private static final Pattern ZERO_DECIMAL_PATTERN = Pattern.compile("\\.0+$");

private StringUtils() {}

// Empty checks
Expand Down Expand Up @@ -520,6 +523,22 @@ public static boolean isAllUpperCase(final String cs) {
return true;
}

/**
 * Strips a trailing, all-zero fractional part from the given String.
 *
 * <p>Only a decimal point followed exclusively by zeros at the end of the String is removed, so
 * {@code "246.00"} becomes {@code "246"}, while {@code "246.4000"} and {@code "246.004"} are
 * returned unchanged. A String without such a suffix is returned as is.
 *
 * <p>A {@code null} or empty input String is returned unchanged.
 *
 * @param str the String to handle, may be null
 * @return the String without its trailing zero decimals
 */
public static String removeZeroDecimal(final String str) {
  final boolean nothingToStrip = str == null || str.isEmpty();
  return nothingToStrip ? str : ZERO_DECIMAL_PATTERN.matcher(str).replaceFirst(EMPTY);
}

// Abbreviating
// -----------------------------------------------------------------------
/**
Expand Down
26 changes: 26 additions & 0 deletions core/src/test/java/tech/tablesaw/io/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,32 @@ public void testDataTypeDetection() throws IOException {
assertArrayEquals(bus_types, columnTypes);
}

/**
 * Verifies that columns whose values carry only zero decimals are detected as INTEGER when {@code
 * ignoreZeroDecimal} is enabled.
 */
@Test
public void testNumberTypeDetectionIgnoreZeroDecimal() throws IOException {
  // try-with-resources releases the file handle even when an assertion fails
  try (Reader reader = new FileReader("../data/immunization.csv")) {
    CsvReadOptions options =
        CsvReadOptions.builder(reader).header(true).sample(false).ignoreZeroDecimal(true).build();

    // Columns 3 and 7 contain values with between zero and three trailing zero decimals.
    // They should map to type INTEGER when ignoreZeroDecimal = true
    ColumnType[] columnTypes = new CsvReader().detectColumnTypes(reader, options);
    assertEquals(INTEGER, columnTypes[3]);
    assertEquals(INTEGER, columnTypes[7]);
  }
}

/**
 * Verifies that columns whose values carry zero decimals are detected as DOUBLE when {@code
 * ignoreZeroDecimal} is disabled.
 */
@Test
public void testNumberTypeDetectionRetainZeroDecimal() throws IOException {
  // try-with-resources releases the file handle even when an assertion fails
  try (Reader reader = new FileReader("../data/immunization.csv")) {
    CsvReadOptions options =
        CsvReadOptions.builder(reader).header(true).sample(false).ignoreZeroDecimal(false).build();

    // Columns 3 and 7 contain values with between zero and three trailing zero decimals.
    // They should map to type DOUBLE when ignoreZeroDecimal = false
    ColumnType[] columnTypes = new CsvReader().detectColumnTypes(reader, options);
    assertEquals(DOUBLE, columnTypes[3]);
    assertEquals(DOUBLE, columnTypes[7]);
  }
}

@Test
public void testMillis() {
long[] times = {1530486314124L, 1530488214124L};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
public class FixedWidthReaderTest {

private final FixedWidthFields car_fields_specs = new FixedWidthFields(4, 5, 40, 40, 8);
private final ColumnType[] car_types = {SHORT, STRING, STRING, STRING, FLOAT};
private final ColumnType[] car_types = {SHORT, STRING, STRING, STRING, SHORT};
private final ColumnType[] car_types_with_SKIP = {SHORT, STRING, STRING, SKIP, FLOAT};

@Test
Expand Down
43 changes: 43 additions & 0 deletions core/src/test/java/tech/tablesaw/util/StringUtilsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package tech.tablesaw.util;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import org.junit.jupiter.api.Test;

/** Unit tests for {@link StringUtils}. */
public class StringUtilsTest {

  /**
   * Checks that {@code removeZeroDecimal} strips an all-zero fractional suffix, leaves every other
   * value untouched, and passes empty and {@code null} input through.
   */
  @Test
  public void testRemoveZeroDecimal() {
    // Each row is {input, expected result}
    final String[][] cases = {
      // Zero decimals are removed
      {"246.0", "246"},
      {"146.00", "146"},
      {"357.000", "357"},
      {"347.0000", "347"},
      // Input is returned unchanged
      {"468", "468"},
      {"24", "24"},
      {"468.02", "468.02"},
      {"246.004", "246.004"},
      {"246.4000", "246.4000"},
      // Empty string handling
      {"", ""},
    };

    for (final String[] row : cases) {
      final String input = row[0];
      final String expected = row[1];
      assertEquals(expected, StringUtils.removeZeroDecimal(input));
    }

    // null handling
    assertNull(StringUtils.removeZeroDecimal(null));
  }
}
Loading