From 5c3ac8b375b9bfd0a2844534c3232f701c5d6230 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 19 May 2020 10:59:53 -0700 Subject: [PATCH] Improve LoadCsv to handle null values when deducing the column types (#2916) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Unit test to repro * Fix https://github.com/dotnet/corefxlab/issues/2915 Append a null value to a column when encountering it instead of changing the column type to a StringDataFrameColumn * Update src/Microsoft.Data.Analysis/DataFrame.IO.cs Co-authored-by: Günther Foidl * Update src/Microsoft.Data.Analysis/DataFrame.cs Co-authored-by: Günther Foidl * Feedback Co-authored-by: Günther Foidl --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 6 + src/Microsoft.Data.Analysis/DataFrame.cs | 47 +++--- .../DataFrame.IOTests.cs | 158 ++++++++++++++++++ 3 files changed, 191 insertions(+), 20 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index dd0752a0ae..084b66379d 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -23,6 +23,12 @@ private static Type GuessKind(int col, List read) throw new FormatException(string.Format(Strings.LessColumnsThatExpected, nbline + 1)); string val = line[col]; + + if (string.Equals(val, "null", StringComparison.OrdinalIgnoreCase)) + { + continue; + } + bool boolParse = bool.TryParse(val, out bool boolResult); if (boolParse) { diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 3587e0b4e2..e62f49a2e4 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -463,35 +463,42 @@ public DataFrame Append(IEnumerable row = null, bool inPlace = false) bool columnMoveNext = columnEnumerator.MoveNext(); if (row != null) { - // Go through row first to make sure there are no data type incompatibilities - IEnumerator rowEnumerator = row.GetEnumerator(); - bool rowMoveNext = rowEnumerator.MoveNext(); - List cachedObjectConversions = new List(); - while (columnMoveNext && rowMoveNext) + // Go through row first to make sure there are no data type incompatibilities + IEnumerator rowEnumerator = row.GetEnumerator(); + bool rowMoveNext = rowEnumerator.MoveNext(); + List cachedObjectConversions = new List(); + while (columnMoveNext && rowMoveNext) + { + DataFrameColumn column = columnEnumerator.Current; + object value = rowEnumerator.Current; + // StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls + if (value is string stringValue) { - DataFrameColumn column = columnEnumerator.Current; - object value = rowEnumerator.Current; - // StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls - if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string)) + if (stringValue.Length == 0 && column.DataType != typeof(string)) { value = null; } - if (value != null) + else if (stringValue.Equals("null", StringComparison.OrdinalIgnoreCase)) { - value = Convert.ChangeType(value, column.DataType); - if (value is null) - { - throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString()); - } + value = null; } - cachedObjectConversions.Add(value); - columnMoveNext = columnEnumerator.MoveNext(); - rowMoveNext = rowEnumerator.MoveNext(); } - if (rowMoveNext) + if (value != null) { - throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row)); + value = Convert.ChangeType(value, column.DataType); + if (value is null) + { + throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString()); + } } + cachedObjectConversions.Add(value); + columnMoveNext = columnEnumerator.MoveNext(); + rowMoveNext = rowEnumerator.MoveNext(); + } + if (rowMoveNext) + { + throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row)); + } // Reset the enumerators columnEnumerator = ret.Columns.GetEnumerator(); columnMoveNext = columnEnumerator.MoveNext(); diff --git a/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index cd49f02b68..9bed04638c 100644 --- a/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -445,5 +445,163 @@ Stream GetStream(string streamData) VerifyColumnTypes(df); } + + [Fact] + public void TestReadCsvWithAllNulls() + { + string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs +null,null,null,null +Null,Null,Null,Null +null,null,null,null +Null,Null,Null,Null +null,null,null,null +null,null,null,null"; + + Stream GetStream(string streamData) + { + return new MemoryStream(Encoding.Default.GetBytes(streamData)); + } + DataFrame df = DataFrame.LoadCsv(GetStream(data)); + Assert.Equal(6, df.Rows.Count); + Assert.Equal(4, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(string) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + Assert.True(typeof(string) == df.Columns[3].DataType); + + Assert.Equal("vendor_id", df.Columns[0].Name); + Assert.Equal("rate_code", df.Columns[1].Name); + Assert.Equal("passenger_count", df.Columns[2].Name); + Assert.Equal("trip_time_in_secs", df.Columns[3].Name); + VerifyColumnTypes(df); + + foreach (var column in df.Columns) + { + Assert.Equal(6, column.NullCount); + foreach (var value in column) + { + Assert.Null(value); + } + } + } + + [Fact] + public void TestReadCsvWithNullsAndDataTypes() + { + string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs +null,1,1,1271 +CMT,Null,1,474 +CMT,1,null,637 +Null,,, +,,, +CMT,1,1,null"; + + Stream GetStream(string streamData) + { + return new MemoryStream(Encoding.Default.GetBytes(streamData)); + } + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(short), typeof(int), typeof(long) }); + Assert.Equal(6, df.Rows.Count); + Assert.Equal(4, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(short) == df.Columns[1].DataType); + Assert.True(typeof(int) == df.Columns[2].DataType); + Assert.True(typeof(long) == df.Columns[3].DataType); + + Assert.Equal("vendor_id", df.Columns[0].Name); + Assert.Equal("rate_code", df.Columns[1].Name); + Assert.Equal("passenger_count", df.Columns[2].Name); + Assert.Equal("trip_time_in_secs", df.Columns[3].Name); + VerifyColumnTypes(df); + + foreach (var column in df.Columns) + { + if (column.DataType != typeof(string)) + { + Assert.Equal(3, column.NullCount); + } + else + { + Assert.Equal(2, column.NullCount); + } + } + var nullRow = df.Rows[3]; + Assert.Null(nullRow[0]); + Assert.Null(nullRow[1]); + Assert.Null(nullRow[2]); + Assert.Null(nullRow[3]); + + nullRow = df.Rows[4]; + Assert.Equal("", nullRow[0]); + Assert.Null(nullRow[1]); + Assert.Null(nullRow[2]); + Assert.Null(nullRow[3]); + + Assert.Null(df[0, 0]); + Assert.Null(df[1, 1]); + Assert.Null(df[2, 2]); + Assert.Null(df[5, 3]); + } + + [Fact] + public void TestReadCsvWithNulls() + { + string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs +null,1,1,1271 +CMT,Null,1,474 +CMT,1,null,637 +Null,,, +,,, +CMT,1,1,null"; + + Stream GetStream(string streamData) + { + return new MemoryStream(Encoding.Default.GetBytes(streamData)); + } + DataFrame df = DataFrame.LoadCsv(GetStream(data)); + Assert.Equal(6, df.Rows.Count); + Assert.Equal(4, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(float) == df.Columns[1].DataType); + Assert.True(typeof(float) == df.Columns[2].DataType); + Assert.True(typeof(float) == df.Columns[3].DataType); + + Assert.Equal("vendor_id", df.Columns[0].Name); + Assert.Equal("rate_code", df.Columns[1].Name); + Assert.Equal("passenger_count", df.Columns[2].Name); + Assert.Equal("trip_time_in_secs", df.Columns[3].Name); + VerifyColumnTypes(df); + + foreach (var column in df.Columns) + { + if (column.DataType != typeof(string)) + { + Assert.Equal(3, column.NullCount); + } + else + { + Assert.Equal(2, column.NullCount); + } + } + var nullRow = df.Rows[3]; + Assert.Null(nullRow[0]); + Assert.Null(nullRow[1]); + Assert.Null(nullRow[2]); + Assert.Null(nullRow[3]); + + nullRow = df.Rows[4]; + Assert.Equal("", nullRow[0]); + Assert.Null(nullRow[1]); + Assert.Null(nullRow[2]); + Assert.Null(nullRow[3]); + + Assert.Null(df[0, 0]); + Assert.Null(df[1, 1]); + Assert.Null(df[2, 2]); + Assert.Null(df[5, 3]); + } } }