Skip to content

Commit

Permalink
Improve LoadCsv to handle null values when deducing the column types (#2916)
Browse files Browse the repository at this point in the history

* Unit test to repro

* Fix dotnet/corefxlab#2915

When a "null" value is encountered, append a null to the column instead of changing the column type to StringDataFrameColumn.

* Update src/Microsoft.Data.Analysis/DataFrame.IO.cs

Co-authored-by: Günther Foidl <gue@korporal.at>

* Update src/Microsoft.Data.Analysis/DataFrame.cs

Co-authored-by: Günther Foidl <gue@korporal.at>

* Feedback

Co-authored-by: Günther Foidl <gue@korporal.at>
  • Loading branch information
Prashanth Govindarajan and gfoidl authored May 19, 2020
1 parent 28140bd commit 5c3ac8b
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 20 deletions.
6 changes: 6 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ private static Type GuessKind(int col, List<string[]> read)
throw new FormatException(string.Format(Strings.LessColumnsThatExpected, nbline + 1));

string val = line[col];

if (string.Equals(val, "null", StringComparison.OrdinalIgnoreCase))
{
continue;
}

bool boolParse = bool.TryParse(val, out bool boolResult);
if (boolParse)
{
Expand Down
47 changes: 27 additions & 20 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -463,35 +463,42 @@ public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
bool columnMoveNext = columnEnumerator.MoveNext();
if (row != null)
{
// Go through row first to make sure there are no data type incompatibilities
IEnumerator<object> rowEnumerator = row.GetEnumerator();
bool rowMoveNext = rowEnumerator.MoveNext();
List<object> cachedObjectConversions = new List<object>();
while (columnMoveNext && rowMoveNext)
// Go through row first to make sure there are no data type incompatibilities
IEnumerator<object> rowEnumerator = row.GetEnumerator();
bool rowMoveNext = rowEnumerator.MoveNext();
List<object> cachedObjectConversions = new List<object>();
while (columnMoveNext && rowMoveNext)
{
DataFrameColumn column = columnEnumerator.Current;
object value = rowEnumerator.Current;
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
if (value is string stringValue)
{
DataFrameColumn column = columnEnumerator.Current;
object value = rowEnumerator.Current;
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string))
if (stringValue.Length == 0 && column.DataType != typeof(string))
{
value = null;
}
if (value != null)
else if (stringValue.Equals("null", StringComparison.OrdinalIgnoreCase))
{
value = Convert.ChangeType(value, column.DataType);
if (value is null)
{
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
}
value = null;
}
cachedObjectConversions.Add(value);
columnMoveNext = columnEnumerator.MoveNext();
rowMoveNext = rowEnumerator.MoveNext();
}
if (rowMoveNext)
if (value != null)
{
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
value = Convert.ChangeType(value, column.DataType);
if (value is null)
{
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
}
}
cachedObjectConversions.Add(value);
columnMoveNext = columnEnumerator.MoveNext();
rowMoveNext = rowEnumerator.MoveNext();
}
if (rowMoveNext)
{
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
}
// Reset the enumerators
columnEnumerator = ret.Columns.GetEnumerator();
columnMoveNext = columnEnumerator.MoveNext();
Expand Down
158 changes: 158 additions & 0 deletions tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -445,5 +445,163 @@ Stream GetStream(string streamData)
VerifyColumnTypes(df);

}

// Loads a CSV in which every cell is the literal "null" (mixed casing) without passing
// explicit data types, and checks that every column is typed as string and that every
// value in every column is reported as null.
[Fact]
public void TestReadCsvWithAllNulls()
{
    // Verbatim literal: the data rows intentionally start at column 0 so that no
    // leading whitespace becomes part of a cell.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs
null,null,null,null
Null,Null,Null,Null
null,null,null,null
Null,Null,Null,Null
null,null,null,null
null,null,null,null";

    Stream GetStream(string streamData)
    {
        return new MemoryStream(Encoding.Default.GetBytes(streamData));
    }

    // The stream is owned by the test, so dispose it deterministically. The assertions
    // stay inside the using block so they never observe a disposed stream.
    using (Stream csvStream = GetStream(data))
    {
        DataFrame df = DataFrame.LoadCsv(csvStream);
        Assert.Equal(6, df.Rows.Count);
        Assert.Equal(4, df.Columns.Count);

        // Assert.Equal (rather than Assert.True on ==) reports the actual type on failure.
        Assert.Equal(typeof(string), df.Columns[0].DataType);
        Assert.Equal(typeof(string), df.Columns[1].DataType);
        Assert.Equal(typeof(string), df.Columns[2].DataType);
        Assert.Equal(typeof(string), df.Columns[3].DataType);

        string[] expectedNames = { "vendor_id", "rate_code", "passenger_count", "trip_time_in_secs" };
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.Equal(expectedNames[i], df.Columns[i].Name);
        }
        VerifyColumnTypes(df);

        // All six data rows are "null"/"Null", so each column must hold six nulls.
        foreach (var column in df.Columns)
        {
            Assert.Equal(6, column.NullCount);
            foreach (var value in column)
            {
                Assert.Null(value);
            }
        }
    }
}

// Loads a CSV that mixes real values, the literal "null" (mixed casing) and empty cells,
// passing explicit column types (string, short, int, long). Checks that the requested
// types are kept and that "null" cells are read back as null in every column, while an
// empty cell is read back as "" in the string column and as null in the numeric columns.
[Fact]
public void TestReadCsvWithNullsAndDataTypes()
{
    // Verbatim literal: the data rows intentionally start at column 0 so that no
    // leading whitespace becomes part of a cell.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs
null,1,1,1271
CMT,Null,1,474
CMT,1,null,637
Null,,,
,,,
CMT,1,1,null";

    Stream GetStream(string streamData)
    {
        return new MemoryStream(Encoding.Default.GetBytes(streamData));
    }

    // The stream is owned by the test, so dispose it deterministically. The assertions
    // stay inside the using block so they never observe a disposed stream.
    using (Stream csvStream = GetStream(data))
    {
        DataFrame df = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(short), typeof(int), typeof(long) });
        Assert.Equal(6, df.Rows.Count);
        Assert.Equal(4, df.Columns.Count);

        // Assert.Equal (rather than Assert.True on ==) reports the actual type on failure.
        Assert.Equal(typeof(string), df.Columns[0].DataType);
        Assert.Equal(typeof(short), df.Columns[1].DataType);
        Assert.Equal(typeof(int), df.Columns[2].DataType);
        Assert.Equal(typeof(long), df.Columns[3].DataType);

        string[] expectedNames = { "vendor_id", "rate_code", "passenger_count", "trip_time_in_secs" };
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.Equal(expectedNames[i], df.Columns[i].Name);
        }
        VerifyColumnTypes(df);

        // The string column is expected to hold 2 nulls (the "null"/"Null" cells; its
        // empty cell stays ""), each non-string column 3 (one "null" cell plus two empty cells).
        foreach (var column in df.Columns)
        {
            int expectedNullCount = column.DataType != typeof(string) ? 3 : 2;
            Assert.Equal(expectedNullCount, column.NullCount);
        }

        // Row 3 is "Null,,,": null in every column.
        var nullRow = df.Rows[3];
        Assert.Null(nullRow[0]);
        Assert.Null(nullRow[1]);
        Assert.Null(nullRow[2]);
        Assert.Null(nullRow[3]);

        // Row 4 is ",,,": empty string in the string column, null elsewhere.
        nullRow = df.Rows[4];
        Assert.Equal("", nullRow[0]);
        Assert.Null(nullRow[1]);
        Assert.Null(nullRow[2]);
        Assert.Null(nullRow[3]);

        // The individual "null" cells scattered through the other rows.
        Assert.Null(df[0, 0]);
        Assert.Null(df[1, 1]);
        Assert.Null(df[2, 2]);
        Assert.Null(df[5, 3]);
    }
}

// Loads a CSV that mixes real values, the literal "null" (mixed casing) and empty cells
// WITHOUT passing explicit column types. Checks that the first column is typed as string
// and the remaining three as float despite the "null" cells, and that null / empty cells
// are read back the same way as when the types are supplied explicitly.
[Fact]
public void TestReadCsvWithNulls()
{
    // Verbatim literal: the data rows intentionally start at column 0 so that no
    // leading whitespace becomes part of a cell.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs
null,1,1,1271
CMT,Null,1,474
CMT,1,null,637
Null,,,
,,,
CMT,1,1,null";

    Stream GetStream(string streamData)
    {
        return new MemoryStream(Encoding.Default.GetBytes(streamData));
    }

    // The stream is owned by the test, so dispose it deterministically. The assertions
    // stay inside the using block so they never observe a disposed stream.
    using (Stream csvStream = GetStream(data))
    {
        DataFrame df = DataFrame.LoadCsv(csvStream);
        Assert.Equal(6, df.Rows.Count);
        Assert.Equal(4, df.Columns.Count);

        // Assert.Equal (rather than Assert.True on ==) reports the actual type on failure.
        Assert.Equal(typeof(string), df.Columns[0].DataType);
        Assert.Equal(typeof(float), df.Columns[1].DataType);
        Assert.Equal(typeof(float), df.Columns[2].DataType);
        Assert.Equal(typeof(float), df.Columns[3].DataType);

        string[] expectedNames = { "vendor_id", "rate_code", "passenger_count", "trip_time_in_secs" };
        for (int i = 0; i < expectedNames.Length; i++)
        {
            Assert.Equal(expectedNames[i], df.Columns[i].Name);
        }
        VerifyColumnTypes(df);

        // The string column is expected to hold 2 nulls (the "null"/"Null" cells; its
        // empty cell stays ""), each non-string column 3 (one "null" cell plus two empty cells).
        foreach (var column in df.Columns)
        {
            int expectedNullCount = column.DataType != typeof(string) ? 3 : 2;
            Assert.Equal(expectedNullCount, column.NullCount);
        }

        // Row 3 is "Null,,,": null in every column.
        var nullRow = df.Rows[3];
        Assert.Null(nullRow[0]);
        Assert.Null(nullRow[1]);
        Assert.Null(nullRow[2]);
        Assert.Null(nullRow[3]);

        // Row 4 is ",,,": empty string in the string column, null elsewhere.
        nullRow = df.Rows[4];
        Assert.Equal("", nullRow[0]);
        Assert.Null(nullRow[1]);
        Assert.Null(nullRow[2]);
        Assert.Null(nullRow[3]);

        // The individual "null" cells scattered through the other rows.
        Assert.Null(df[0, 0]);
        Assert.Null(df[1, 1]);
        Assert.Null(df[2, 2]);
        Assert.Null(df[5, 3]);
    }
}
}
}

0 comments on commit 5c3ac8b

Please sign in to comment.