From b982eed91c3b5a2bba94c177d735549d7576cc42 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 10 Mar 2021 13:06:20 -0800 Subject: [PATCH 1/9] IDataView -> DataFrame Implement the virtual function --- .../DataFrame.IDataView.cs | 2 +- .../DataFrameColumn.cs | 7 ++ .../IDataView.Extension.cs | 105 ++++++++++++++++++ .../PrimitiveDataFrameColumn.cs | 25 +++++ .../StringDataFrameColumn.cs | 24 ++++ .../DataFrameTests.IDataView.cs | 23 +++- 6 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 src/Microsoft.Data.Analysis/IDataView.Extension.cs diff --git a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs index 4755f296f4..59e457eb5f 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs @@ -11,7 +11,7 @@ namespace Microsoft.Data.Analysis { public partial class DataFrame : IDataView - { + { // TODO: support shuffling bool IDataView.CanShuffle => false; diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index c064ed8bda..14a3b20f1d 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -247,6 +247,13 @@ public virtual DataFrameColumn Sort(bool ascending = true) /// protected internal virtual void AddDataViewColumn(DataViewSchema.Builder builder) => throw new NotImplementedException(); + /// + /// Appends a value to this using + /// + /// The row cursor which has the current position + /// The in + protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException(); + /// /// Clamps values beyond the specified thresholds /// diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs new file mode 100644 index 0000000000..14956af43a --- /dev/null +++ 
b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Microsoft.Data.Analysis +{ + public static class IDataViewExtensions + { + public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = -1, params string[] selectColumns) + { + DataViewSchema schema = dataView.Schema; + List columns = new List(schema.Count); + + HashSet selectColumnsSet = null; + if (selectColumns != null && selectColumns.Length > 0) + { + selectColumnsSet = new HashSet(selectColumns); + } + + List activeColumns = new List(); + foreach (DataViewSchema.Column column in schema) + { + long length = maxRows >= 0 ? maxRows : long.MaxValue; + length = Math.Min(length, dataView.GetRowCount() ?? 
0); + if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name))) + { + continue; + } + + activeColumns.Add(column); + DataViewType type = column.Type; + if (type == BooleanDataViewType.Instance) + { + columns.Add(new BooleanDataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.Byte) + { + columns.Add(new ByteDataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.Double) + { + columns.Add(new DoubleDataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.Single) + { + columns.Add(new SingleDataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.Int32) + { + columns.Add(new Int32DataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.Int64) + { + columns.Add(new Int64DataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.SByte) + { + columns.Add(new SByteDataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.Int16) + { + columns.Add(new Int16DataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.UInt32) + { + columns.Add(new UInt32DataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.UInt64) + { + columns.Add(new UInt64DataFrameColumn(column.Name, length)); + } + else if (type == NumberDataViewType.UInt16) + { + columns.Add(new UInt16DataFrameColumn(column.Name, length)); + } + else if (type == TextDataViewType.Instance) + { + columns.Add(new StringDataFrameColumn(column.Name, length)); + } + else + { + throw new NotSupportedException(nameof(type)); + } + } + + DataFrame ret = new DataFrame(columns); + DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns); + while (cursor.MoveNext()) + { + foreach (var column in activeColumns) + { + columns[column.Index].AddValueUsingCursor(cursor, column); + } + } + + return ret; + } + } + +} diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs 
b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index 613644a346..8385873db8 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -775,5 +775,30 @@ private static ValueGetter CreateCharValueGetterDelegate(DataViewRowCurs private static ValueGetter CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn column) => (ref double value) => value = (double?)column[cursor.Position] ?? double.NaN; + + private ValueGetter getter = null; + + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column) + { + long row = cursor.Position; + T value = default; + if (getter == null) + { + getter = cursor.GetGetter(column); + } + getter(ref value); + if (Length > row) + { + this[row] = value; + } + else if (Length == row) + { + Append(value); + } + else + { + throw new IndexOutOfRangeException(nameof(row)); + } + } } } diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index 92f2be029e..34239e18f0 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -467,5 +467,29 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor) private ValueGetter> CreateValueGetterDelegate(DataViewRowCursor cursor) => (ref ReadOnlyMemory value) => value = this[cursor.Position].AsMemory(); + + private ValueGetter> getter = null; + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) + { + long row = cursor.Position; + ReadOnlyMemory value = default; + if (getter == null) + { + getter = cursor.GetGetter>(schemaColumn); + } + getter(ref value); + if (Length > row) + { + this[row] = value.ToString(); + } + else if (Length == row) + { + Append(value.ToString()); + } + else + { + throw new 
IndexOutOfRangeException(nameof(row)); + } + } } } diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs index 9ed4963b7f..84826ca40f 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs @@ -10,12 +10,12 @@ namespace Microsoft.Data.Analysis.Tests { - public partial class DataFrameTests + public partial class DataFrameIDataViewTests { [Fact] public void TestIDataView() { - IDataView dataView = MakeDataFrameWithAllColumnTypes(10, withNulls: false); + IDataView dataView = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); DataDebuggerPreview preview = dataView.Preview(); Assert.Equal(10, preview.RowView.Length); @@ -85,7 +85,7 @@ public void TestIDataView() [Fact] public void TestIDataViewSchemaInvalidate() { - DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10, withNulls: false); + DataFrame df = DataFrameTests.MakeDataFrameWithAllMutableColumnTypes(10, withNulls: false); IDataView dataView = df; @@ -113,7 +113,7 @@ public void TestIDataViewSchemaInvalidate() public void TestIDataViewWithNulls() { int length = 10; - IDataView dataView = MakeDataFrameWithAllColumnTypes(length, withNulls: true); + IDataView dataView = DataFrameTests.MakeDataFrameWithAllColumnTypes(length, withNulls: true); DataDebuggerPreview preview = dataView.Preview(); Assert.Equal(length, preview.RowView.Length); @@ -224,5 +224,20 @@ public void TestIDataViewWithNulls() Assert.Equal("", preview.ColumnView[14].Values[5].ToString()); // null row Assert.Equal("foo", preview.ColumnView[14].Values[6].ToString()); } + + [Fact] + public void TestDataFrameFromIDataView() + { + DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); + df.Columns.Remove("Char"); // Because chars are returned as uint16 by IDataView, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn 
and fail asserts + IDataView dfAsIDataView = df; + DataFrame newDf = dfAsIDataView.ToDataFrame(); + Assert.Equal(dfAsIDataView.GetRowCount(), newDf.Rows.Count); + Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count); + for (int i = 0; i < df.Columns.Count; i++) + { + Assert.True(df.Columns[i].ElementwiseEquals(newDf.Columns[i]).All()); + } + } } } From a8f6ad7f124d531b6c3d6be1e0038513c34762fb Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 12 Mar 2021 13:04:52 -0800 Subject: [PATCH 2/9] More APIs and unit tests --- .../IDataView.Extension.cs | 16 ++++++++--- .../StringDataFrameColumn.cs | 1 + .../DataFrameTests.IDataView.cs | 27 +++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs index 14956af43a..fa0765e560 100644 --- a/src/Microsoft.Data.Analysis/IDataView.Extension.cs +++ b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -11,7 +11,17 @@ namespace Microsoft.Data.Analysis { public static class IDataViewExtensions { - public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = -1, params string[] selectColumns) + public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = -1) + { + return ToDataFrame(dataView, maxRows, null); + } + + public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns) + { + return ToDataFrame(dataView, -1, selectColumns); + } + + public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns) { DataViewSchema schema = dataView.Schema; List columns = new List(schema.Count); @@ -92,9 +102,9 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = -1, DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns); while (cursor.MoveNext()) { - foreach (var column in activeColumns) + foreach (DataViewSchema.Column column in activeColumns) { - 
columns[column.Index].AddValueUsingCursor(cursor, column); + ret[column.Name].AddValueUsingCursor(cursor, column); } } diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index 34239e18f0..d6ae8540d8 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -469,6 +469,7 @@ private ValueGetter> CreateValueGetterDelegate(DataViewRowC (ref ReadOnlyMemory value) => value = this[cursor.Position].AsMemory(); private ValueGetter> getter = null; + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) { long row = cursor.Position; diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs index 84826ca40f..2e7c6db2f9 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs @@ -239,5 +239,32 @@ public void TestDataFrameFromIDataView() Assert.True(df.Columns[i].ElementwiseEquals(newDf.Columns[i]).All()); } } + + [Fact] + public void TestDataFrameFromIDataView_SelectColumns() + { + DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); + IDataView dfAsIDataView = df; + DataFrame newDf = dfAsIDataView.ToDataFrame("Int", "Double"); + Assert.Equal(dfAsIDataView.GetRowCount(), newDf.Rows.Count); + Assert.Equal(2, newDf.Columns.Count); + Assert.True(df.Columns["Int"].ElementwiseEquals(newDf.Columns["Int"]).All()); + Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All()); + } + + [Fact] + public void TestDataFrameFromIDataView_SelectRows() + { + DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); + df.Columns.Remove("Char"); // Because chars are returned as uint16 by IDataView, so end up comparing CharDataFrameColumn to 
UInt16DataFrameColumn and fail asserts + IDataView dfAsIDataView = df; + DataFrame newDf = dfAsIDataView.ToDataFrame(5); + Assert.Equal(5, newDf.Rows.Count); + Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count); + for (int i = 0; i < df.Columns.Count; i++) + { + Assert.True(df.Columns[i].ElementwiseEquals(newDf.Columns[i]).All()); + } + } } } From cb3c28d2b6775f2cf8465599ba9ff1a91c0199c1 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 12 Mar 2021 13:08:57 -0800 Subject: [PATCH 3/9] Another unit test --- .../DataFrameTests.IDataView.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs index 2e7c6db2f9..8a0987929c 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs @@ -266,5 +266,17 @@ public void TestDataFrameFromIDataView_SelectRows() Assert.True(df.Columns[i].ElementwiseEquals(newDf.Columns[i]).All()); } } + + [Fact] + public void TestDataFrameFromIDataView_SelectColumnsAndRows() + { + DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); + IDataView dfAsIDataView = df; + DataFrame newDf = dfAsIDataView.ToDataFrame(5, "Int", "Double"); + Assert.Equal(5, newDf.Rows.Count); + Assert.Equal(2, newDf.Columns.Count); + Assert.True(df.Columns["Int"].ElementwiseEquals(newDf.Columns["Int"]).All()); + Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All()); + } } } From 90cc62cf43ec20029ccc2328b3bf5337af1d3caf Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 16 Mar 2021 13:35:58 -0700 Subject: [PATCH 4/9] Address feedback --- .../DataFrame.IDataView.cs | 78 +-------------- .../DataFrameColumn.cs | 10 +- .../IDataView.Extension.cs | 27 +++-- .../PrimitiveDataFrameColumn.cs | 17 ++-- src/Microsoft.Data.Analysis/RowCursor.cs | 98 
+++++++++++++++++++ .../StringDataFrameColumn.cs | 17 ++-- .../strings.Designer.cs | 9 ++ src/Microsoft.Data.Analysis/strings.resx | 5 +- ...DataView.cs => DataFrameIDataViewTests.cs} | 81 ++++++++++++++- 9 files changed, 236 insertions(+), 106 deletions(-) create mode 100644 src/Microsoft.Data.Analysis/RowCursor.cs rename test/Microsoft.Data.Analysis.Tests/{DataFrameTests.IDataView.cs => DataFrameIDataViewTests.cs} (81%) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs index 59e457eb5f..12a3b4b487 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs @@ -4,9 +4,7 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using Microsoft.ML; -using Microsoft.ML.Data; namespace Microsoft.Data.Analysis { @@ -16,7 +14,7 @@ public partial class DataFrame : IDataView bool IDataView.CanShuffle => false; private DataViewSchema _schema; - private DataViewSchema DataViewSchema + internal DataViewSchema DataViewSchema { get { @@ -53,6 +51,7 @@ private DataViewRowCursor GetRowCursorCore(IEnumerable co return new RowCursor(this, activeColumns); } + DataViewRowCursor IDataView.GetRowCursor(IEnumerable columnsNeeded, Random rand) { return GetRowCursorCore(columnsNeeded); @@ -63,78 +62,5 @@ DataViewRowCursor[] IDataView.GetRowCursorSet(IEnumerable // TODO: change to support parallel cursors return new DataViewRowCursor[] { GetRowCursorCore(columnsNeeded) }; } - - private sealed class RowCursor : DataViewRowCursor - { - private bool _disposed; - private long _position; - private readonly DataFrame _dataFrame; - private readonly Delegate[] _getters; - - public RowCursor(DataFrame dataFrame, bool[] activeColumns) - { - Debug.Assert(dataFrame != null); - Debug.Assert(activeColumns != null); - - _position = -1; - _dataFrame = dataFrame; - _getters = new Delegate[Schema.Count]; - for (int i = 0; i < _getters.Length; i++) - { - if 
(!activeColumns[i]) - continue; - _getters[i] = CreateGetterDelegate(i); - Debug.Assert(_getters[i] != null); - } - } - - public override long Position => _position; - public override long Batch => 0; - public override DataViewSchema Schema => _dataFrame.DataViewSchema; - - protected override void Dispose(bool disposing) - { - if (_disposed) - return; - if (disposing) - { - _position = -1; - } - _disposed = true; - base.Dispose(disposing); - } - - private Delegate CreateGetterDelegate(int col) - { - DataFrameColumn column = _dataFrame.Columns[col]; - return column.GetDataViewGetter(this); - } - - public override ValueGetter GetGetter(DataViewSchema.Column column) - { - if (!IsColumnActive(column)) - throw new ArgumentOutOfRangeException(nameof(column)); - - return (ValueGetter)_getters[column.Index]; - } - - public override ValueGetter GetIdGetter() - { - return (ref DataViewRowId value) => value = new DataViewRowId((ulong)_position, 0); - } - - public override bool IsColumnActive(DataViewSchema.Column column) - { - return _getters[column.Index] != null; - } - - public override bool MoveNext() - { - if (_disposed) - return false; - _position++; - return _position < _dataFrame.Rows.Count; - } - } } } diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index 14a3b20f1d..958ee9dd36 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -252,7 +252,15 @@ public virtual DataFrameColumn Sort(bool ascending = true) /// /// The row cursor which has the current position /// The in - protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException(); + /// The cached ValueGetter for this column. 
+ internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException(); + + /// + /// Returns the ValueGetter for each active column in as a delegate to be cached. + /// + /// The row cursor which has the current position + /// The in + internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException(); /// /// Clamps values beyond the specified thresholds diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs index fa0765e560..b407a0760f 100644 --- a/src/Microsoft.Data.Analysis/IDataView.Extension.cs +++ b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -4,21 +4,23 @@ using System; using System.Collections.Generic; -using Microsoft.ML; +using Microsoft.Data.Analysis; using Microsoft.ML.Data; -namespace Microsoft.Data.Analysis +namespace Microsoft.ML { public static class IDataViewExtensions { - public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = -1) + private const int defaultMaxRows = 100; + + public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows) { return ToDataFrame(dataView, maxRows, null); } public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns) { - return ToDataFrame(dataView, -1, selectColumns); + return ToDataFrame(dataView, defaultMaxRows, selectColumns); } public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns) @@ -94,17 +96,28 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param } else { - throw new NotSupportedException(nameof(type)); + throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name)); } } + List activeColumnDelegates = new List(); + DataFrame ret = new 
DataFrame(columns); DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns); - while (cursor.MoveNext()) + int columnIndex = 0; + foreach (DataViewSchema.Column column in activeColumns) + { + Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column); + activeColumnDelegates.Add(valueGetter); + columnIndex++; + } + while (cursor.MoveNext() && cursor.Position < maxRows) { + columnIndex = 0; foreach (DataViewSchema.Column column in activeColumns) { - ret[column.Name].AddValueUsingCursor(cursor, column); + columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]); + columnIndex++; } } diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index 8385873db8..7b32fd25da 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -776,17 +776,13 @@ private static ValueGetter CreateCharValueGetterDelegate(DataViewRowCurs private static ValueGetter CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn column) => (ref double value) => value = (double?)column[cursor.Position] ?? 
double.NaN; - private ValueGetter getter = null; - - protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column) + internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter) { long row = cursor.Position; T value = default; - if (getter == null) - { - getter = cursor.GetGetter(column); - } - getter(ref value); + Debug.Assert(getter != null, "Excepted getter to be valid"); + (getter as ValueGetter)(ref value); + if (Length > row) { this[row] = value; @@ -800,5 +796,10 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D throw new IndexOutOfRangeException(nameof(row)); } } + + internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) + { + return cursor.GetGetter(schemaColumn); + } } } diff --git a/src/Microsoft.Data.Analysis/RowCursor.cs b/src/Microsoft.Data.Analysis/RowCursor.cs new file mode 100644 index 0000000000..feb3e7ec9f --- /dev/null +++ b/src/Microsoft.Data.Analysis/RowCursor.cs @@ -0,0 +1,98 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Microsoft.Data.Analysis +{ + internal sealed class RowCursor : DataViewRowCursor + { + private bool _disposed; + private long _position; + private readonly DataFrame _dataFrame; + internal readonly List _getters; + private Dictionary _columnIndexToGetterIndex; + + public RowCursor(DataFrame dataFrame, bool[] activeColumns) + { + Debug.Assert(dataFrame != null); + Debug.Assert(activeColumns != null); + + _columnIndexToGetterIndex = new Dictionary(); + _position = -1; + _dataFrame = dataFrame; + _getters = new List(); + for (int i = 0; i < Schema.Count; i++) + { + if (!activeColumns[i]) + { + continue; + } + + Delegate getter = CreateGetterDelegate(i); + _getters.Add(getter); + Debug.Assert(getter != null); + _columnIndexToGetterIndex[i] = _getters.Count - 1; + } + } + + public override long Position => _position; + public override long Batch => 0; + public override DataViewSchema Schema => _dataFrame.DataViewSchema; + + protected override void Dispose(bool disposing) + { + if (_disposed) + { + return; + } + + if (disposing) + { + _position = -1; + } + + _disposed = true; + base.Dispose(disposing); + } + + private Delegate CreateGetterDelegate(int col) + { + DataFrameColumn column = _dataFrame.Columns[col]; + return column.GetDataViewGetter(this); + } + + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + if (!IsColumnActive(column)) + throw new ArgumentOutOfRangeException(nameof(column)); + + return (ValueGetter)_getters[_columnIndexToGetterIndex[column.Index]]; + } + + public override ValueGetter GetIdGetter() + { + return (ref DataViewRowId value) => value = new DataViewRowId((ulong)_position, 0); + } + + public override bool IsColumnActive(DataViewSchema.Column column) + { + return _getters[_columnIndexToGetterIndex[column.Index]] != null; + } + + public override bool MoveNext() + { + if (_disposed) + { 
+ return false; + } + _position++; + return _position < _dataFrame.Rows.Count; + } + } +} diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index d6ae8540d8..4b1b42ce13 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -468,17 +468,14 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor) private ValueGetter> CreateValueGetterDelegate(DataViewRowCursor cursor) => (ref ReadOnlyMemory value) => value = this[cursor.Position].AsMemory(); - private ValueGetter> getter = null; - - protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) + internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter) { long row = cursor.Position; ReadOnlyMemory value = default; - if (getter == null) - { - getter = cursor.GetGetter>(schemaColumn); - } - getter(ref value); + Debug.Assert(getter != null, "Excepted getter to be valid"); + + (getter as ValueGetter>)(ref value); + if (Length > row) { this[row] = value.ToString(); @@ -492,5 +489,9 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D throw new IndexOutOfRangeException(nameof(row)); } } + internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) + { + return cursor.GetGetter>(schemaColumn); + } } } diff --git a/src/Microsoft.Data.Analysis/strings.Designer.cs b/src/Microsoft.Data.Analysis/strings.Designer.cs index 030a79a8a3..fc64940869 100644 --- a/src/Microsoft.Data.Analysis/strings.Designer.cs +++ b/src/Microsoft.Data.Analysis/strings.Designer.cs @@ -258,6 +258,15 @@ internal static string NonSeekableStream { } } + /// + /// Looks up a localized string similar to {0} is not a supported column type.. 
+ /// + internal static string NotSupportedColumnType { + get { + return ResourceManager.GetString("NotSupportedColumnType", resourceCulture); + } + } + /// /// Looks up a localized string similar to numeric column. /// diff --git a/src/Microsoft.Data.Analysis/strings.resx b/src/Microsoft.Data.Analysis/strings.resx index 267140834a..ad9f114050 100644 --- a/src/Microsoft.Data.Analysis/strings.resx +++ b/src/Microsoft.Data.Analysis/strings.resx @@ -183,10 +183,13 @@ Expected a seekable stream + + {0} is not a supported column type. + numeric column Cannot span multiple buffers - + \ No newline at end of file diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs similarity index 81% rename from test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs rename to test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs index 8a0987929c..1c2285f57b 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs @@ -256,14 +256,24 @@ public void TestDataFrameFromIDataView_SelectColumns() public void TestDataFrameFromIDataView_SelectRows() { DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); - df.Columns.Remove("Char"); // Because chars are returned as uint16 by IDataView, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn and fail asserts + df.Columns.Remove("Char"); // Because chars are returned as uint16 by DataViewSchema, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn and fail asserts + df.Columns.Remove("Decimal"); // Because decimal is returned as double by DataViewSchema, so end up comparing DecimalDataFrameColumn to DoubleDataFrameColumn and fail asserts IDataView dfAsIDataView = df; DataFrame newDf = dfAsIDataView.ToDataFrame(5); Assert.Equal(5, newDf.Rows.Count); + Assert.Equal(df.Columns.Count, newDf.Columns.Count); + for 
(int i = 0; i < newDf.Columns.Count; i++) + { + Assert.Equal(5, newDf.Columns[i].Length); + Assert.Equal(df.Columns[i].Name, newDf.Columns[i].Name); + } Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count); - for (int i = 0; i < df.Columns.Count; i++) + for (int c = 0; c < df.Columns.Count; c++) { - Assert.True(df.Columns[i].ElementwiseEquals(newDf.Columns[i]).All()); + for (int r = 0; r < 5; r++) + { + Assert.Equal(df.Columns[c][r], newDf.Columns[c][r]); + } } } @@ -274,9 +284,70 @@ public void TestDataFrameFromIDataView_SelectColumnsAndRows() IDataView dfAsIDataView = df; DataFrame newDf = dfAsIDataView.ToDataFrame(5, "Int", "Double"); Assert.Equal(5, newDf.Rows.Count); + for (int i = 0; i < newDf.Columns.Count; i++) + { + Assert.Equal(5, newDf.Columns[i].Length); + } Assert.Equal(2, newDf.Columns.Count); - Assert.True(df.Columns["Int"].ElementwiseEquals(newDf.Columns["Int"]).All()); - Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All()); + for (int r = 0; r < 5; r++) + { + Assert.Equal(df.Columns["Int"][r], newDf.Columns["Int"][r]); + Assert.Equal(df.Columns["Double"][r], newDf.Columns["Double"][r]); + } + } + + private class InputData + { + public string Name { get; set; } + public bool FilterNext { get; set; } + public float Value { get; set; } + } + + private IDataView GetASampleIDataView() + { + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable. 
+ var enumerableOfData = new[] + { + new InputData() { Name = "Joey", FilterNext = false, Value = 1.0f }, + new InputData() { Name = "Chandler", FilterNext = false , Value = 2.0f}, + new InputData() { Name = "Ross", FilterNext = false , Value = 3.0f}, + new InputData() { Name = "Monica", FilterNext = true , Value = 4.0f}, + new InputData() { Name = "Rachel", FilterNext = true , Value = 5.0f}, + new InputData() { Name = "Phoebe", FilterNext = false , Value = 6.0f}, + }; + + IDataView data = mlContext.Data.LoadFromEnumerable(enumerableOfData); + return data; } + + [Fact] + public void TestDataFrameFromIDataView_MLData() + { + IDataView data = GetASampleIDataView(); + DataFrame df = data.ToDataFrame(); + Assert.Equal(6, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + foreach (var column in df.Columns) + { + Assert.Equal(6, column.Length); + } + + void VerifyDataFrameColumnAndDataViewColumnValues(string columnName) + { + int cc = 0; + var nameDataViewColumn = data.GetColumn(columnName); + foreach (var value in nameDataViewColumn) + { + Assert.Equal(value, df.Columns[columnName][cc++]); + } + } + + VerifyDataFrameColumnAndDataViewColumnValues("Name"); + VerifyDataFrameColumnAndDataViewColumnValues("FilterNext"); + VerifyDataFrameColumnAndDataViewColumnValues("Value"); + } + } } From 1260e2267a33419d34f33bb4bc6999fab558d737 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 16 Mar 2021 13:53:54 -0700 Subject: [PATCH 5/9] Last bit of feedback --- .../IDataView.Extension.cs | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs index b407a0760f..23205bc294 100644 --- a/src/Microsoft.Data.Analysis/IDataView.Extension.cs +++ b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -37,8 +37,6 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param List activeColumns = new List(); foreach 
(DataViewSchema.Column column in schema) { - long length = maxRows >= 0 ? maxRows : long.MaxValue; - length = Math.Min(length, dataView.GetRowCount() ?? 0); if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name))) { continue; @@ -48,51 +46,51 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param DataViewType type = column.Type; if (type == BooleanDataViewType.Instance) { - columns.Add(new BooleanDataFrameColumn(column.Name, length)); + columns.Add(new BooleanDataFrameColumn(column.Name)); } else if (type == NumberDataViewType.Byte) { - columns.Add(new ByteDataFrameColumn(column.Name, length)); + columns.Add(new ByteDataFrameColumn(column.Name)); } else if (type == NumberDataViewType.Double) { - columns.Add(new DoubleDataFrameColumn(column.Name, length)); + columns.Add(new DoubleDataFrameColumn(column.Name)); } else if (type == NumberDataViewType.Single) { - columns.Add(new SingleDataFrameColumn(column.Name, length)); + columns.Add(new SingleDataFrameColumn(column.Name)); } else if (type == NumberDataViewType.Int32) { - columns.Add(new Int32DataFrameColumn(column.Name, length)); + columns.Add(new Int32DataFrameColumn(column.Name)); } else if (type == NumberDataViewType.Int64) { - columns.Add(new Int64DataFrameColumn(column.Name, length)); + columns.Add(new Int64DataFrameColumn(column.Name)); } else if (type == NumberDataViewType.SByte) { - columns.Add(new SByteDataFrameColumn(column.Name, length)); + columns.Add(new SByteDataFrameColumn(column.Name)); } else if (type == NumberDataViewType.Int16) { - columns.Add(new Int16DataFrameColumn(column.Name, length)); + columns.Add(new Int16DataFrameColumn(column.Name)); } else if (type == NumberDataViewType.UInt32) { - columns.Add(new UInt32DataFrameColumn(column.Name, length)); + columns.Add(new UInt32DataFrameColumn(column.Name)); } else if (type == NumberDataViewType.UInt64) { - columns.Add(new UInt64DataFrameColumn(column.Name, length)); + columns.Add(new 
UInt64DataFrameColumn(column.Name)); } else if (type == NumberDataViewType.UInt16) { - columns.Add(new UInt16DataFrameColumn(column.Name, length)); + columns.Add(new UInt16DataFrameColumn(column.Name)); } else if (type == TextDataViewType.Instance) { - columns.Add(new StringDataFrameColumn(column.Name, length)); + columns.Add(new StringDataFrameColumn(column.Name)); } else { @@ -102,7 +100,6 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param List activeColumnDelegates = new List(); - DataFrame ret = new DataFrame(columns); DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns); int columnIndex = 0; foreach (DataViewSchema.Column column in activeColumns) @@ -121,7 +118,7 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param } } - return ret; + return new DataFrame(columns); } } From 96bb44a3f17f34032f9c4c4242c4283acfe70a41 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 16 Mar 2021 14:18:38 -0700 Subject: [PATCH 6/9] Fix some stuff and unit tests --- .../DataFrameColumn.cs | 4 +- .../IDataView.Extension.cs | 27 +++---- .../PrimitiveDataFrameColumn.cs | 4 +- .../StringDataFrameColumn.cs | 4 +- .../DataFrameIDataViewTests.cs | 74 ++++++++++++++++--- 5 files changed, 84 insertions(+), 29 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index 958ee9dd36..346dd4f242 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -253,14 +253,14 @@ public virtual DataFrameColumn Sort(bool ascending = true) /// The row cursor which has the current position /// The in /// The cached ValueGetter for this column. 
- internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException(); + protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException(); /// /// Returns the ValueGetter for each active column in as a delegate to be cached. /// /// The row cursor which has the current position /// The in - internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException(); + protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException(); /// /// Clamps values beyond the specified thresholds diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs index 23205bc294..527c1b3285 100644 --- a/src/Microsoft.Data.Analysis/IDataView.Extension.cs +++ b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -98,24 +98,25 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param } } - List activeColumnDelegates = new List(); - - DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns); - int columnIndex = 0; - foreach (DataViewSchema.Column column in activeColumns) - { - Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column); - activeColumnDelegates.Add(valueGetter); - columnIndex++; - } - while (cursor.MoveNext() && cursor.Position < maxRows) + using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns)) { - columnIndex = 0; + List activeColumnDelegates = new List(); + int columnIndex = 0; foreach (DataViewSchema.Column column in activeColumns) { - columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]); + Delegate valueGetter = 
columns[columnIndex].GetValueGetterUsingCursor(cursor, column); + activeColumnDelegates.Add(valueGetter); columnIndex++; } + while (cursor.MoveNext() && cursor.Position < maxRows) + { + columnIndex = 0; + foreach (DataViewSchema.Column column in activeColumns) + { + columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]); + columnIndex++; + } + } } return new DataFrame(columns); diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index 7b32fd25da..f91c72802c 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -776,7 +776,7 @@ private static ValueGetter CreateCharValueGetterDelegate(DataViewRowCurs private static ValueGetter CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn column) => (ref double value) => value = (double?)column[cursor.Position] ?? double.NaN; - internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter) + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter) { long row = cursor.Position; T value = default; @@ -797,7 +797,7 @@ internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSch } } - internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) + protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) { return cursor.GetGetter(schemaColumn); } diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index 4b1b42ce13..197cce721d 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -468,7 +468,7 @@ protected internal 
override Delegate GetDataViewGetter(DataViewRowCursor cursor) private ValueGetter> CreateValueGetterDelegate(DataViewRowCursor cursor) => (ref ReadOnlyMemory value) => value = this[cursor.Position].AsMemory(); - internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter) + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter) { long row = cursor.Position; ReadOnlyMemory value = default; @@ -489,7 +489,7 @@ internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSch throw new IndexOutOfRangeException(nameof(row)); } } - internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) + protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) { return cursor.GetGetter>(schemaColumn); } diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs index 1c2285f57b..c090817cf5 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs @@ -322,6 +322,20 @@ private IDataView GetASampleIDataView() return data; } + private void VerifyDataFrameColumnAndDataViewColumnValues(string columnName, IDataView data, DataFrame df, int maxRows = -1) + { + int cc = 0; + var nameDataViewColumn = data.GetColumn(columnName); + foreach (var value in nameDataViewColumn) + { + if (maxRows != -1 && cc >= maxRows) + { + return; + } + Assert.Equal(value, df.Columns[columnName][cc++]); + } + } + [Fact] public void TestDataFrameFromIDataView_MLData() { @@ -334,20 +348,60 @@ public void TestDataFrameFromIDataView_MLData() Assert.Equal(6, column.Length); } - void VerifyDataFrameColumnAndDataViewColumnValues(string columnName) + 
VerifyDataFrameColumnAndDataViewColumnValues("Name", data, df); + VerifyDataFrameColumnAndDataViewColumnValues("FilterNext", data, df); + VerifyDataFrameColumnAndDataViewColumnValues("Value", data, df); + } + + [Fact] + public void TestDataFrameFromIDataView_MLData_SelectColumns() + { + IDataView data = GetASampleIDataView(); + DataFrame df = data.ToDataFrame("Name", "Value"); + Assert.Equal(6, df.Rows.Count); + Assert.Equal(2, df.Columns.Count); + foreach (var column in df.Columns) { - int cc = 0; - var nameDataViewColumn = data.GetColumn(columnName); - foreach (var value in nameDataViewColumn) - { - Assert.Equal(value, df.Columns[columnName][cc++]); - } + Assert.Equal(6, column.Length); } - VerifyDataFrameColumnAndDataViewColumnValues("Name"); - VerifyDataFrameColumnAndDataViewColumnValues("FilterNext"); - VerifyDataFrameColumnAndDataViewColumnValues("Value"); + VerifyDataFrameColumnAndDataViewColumnValues("Name", data, df); + VerifyDataFrameColumnAndDataViewColumnValues("Value", data, df); } + [Theory] + [InlineData(3)] + [InlineData(0)] + public void TestDataFrameFromIDataView_MLData_SelectRows(int maxRows) + { + IDataView data = GetASampleIDataView(); + DataFrame df = data.ToDataFrame(maxRows); + Assert.Equal(maxRows, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + foreach (var column in df.Columns) + { + Assert.Equal(maxRows, column.Length); + } + + VerifyDataFrameColumnAndDataViewColumnValues("Name", data, df, maxRows); + VerifyDataFrameColumnAndDataViewColumnValues("FilterNext", data, df, maxRows); + VerifyDataFrameColumnAndDataViewColumnValues("Value", data, df, maxRows); + } + + [Fact] + public void TestDataFrameFromIDataView_MLData_SelectColumnsAndRows() + { + IDataView data = GetASampleIDataView(); + DataFrame df = data.ToDataFrame(3, "Name", "Value"); + Assert.Equal(3, df.Rows.Count); + Assert.Equal(2, df.Columns.Count); + foreach (var column in df.Columns) + { + Assert.Equal(3, column.Length); + } + + 
VerifyDataFrameColumnAndDataViewColumnValues("Name", data, df, 3); + VerifyDataFrameColumnAndDataViewColumnValues("Value", data, df, 3); + } } } From 9d9f224eff93c728a630ab29c4231ffd0fa2e49e Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 16 Mar 2021 14:36:11 -0700 Subject: [PATCH 7/9] sq --- src/Microsoft.Data.Analysis/IDataView.Extension.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs index 527c1b3285..5b9c9034bc 100644 --- a/src/Microsoft.Data.Analysis/IDataView.Extension.cs +++ b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -100,12 +100,12 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns)) { - List activeColumnDelegates = new List(); + Delegate[] activeColumnDelegates = new Delegate[activeColumns.Count]; int columnIndex = 0; foreach (DataViewSchema.Column column in activeColumns) { Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column); - activeColumnDelegates.Add(valueGetter); + activeColumnDelegates[columnIndex] = valueGetter; columnIndex++; } while (cursor.MoveNext() && cursor.Position < maxRows) From 1ce802bf66e8bb8b96255dfdc421a71076f453de Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Thu, 18 Mar 2021 14:13:08 -0700 Subject: [PATCH 8/9] Move RowCursor back --- .../DataFrame.IDataView.cs | 88 +++++++++++++++++ src/Microsoft.Data.Analysis/RowCursor.cs | 98 ------------------- 2 files changed, 88 insertions(+), 98 deletions(-) delete mode 100644 src/Microsoft.Data.Analysis/RowCursor.cs diff --git a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs index 12a3b4b487..79d5c693fc 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs @@ -4,7 +4,9 
@@ using System; using System.Collections.Generic; +using System.Diagnostics; using Microsoft.ML; +using Microsoft.ML.Data; namespace Microsoft.Data.Analysis { @@ -62,5 +64,91 @@ DataViewRowCursor[] IDataView.GetRowCursorSet(IEnumerable // TODO: change to support parallel cursors return new DataViewRowCursor[] { GetRowCursorCore(columnsNeeded) }; } + + private sealed class RowCursor : DataViewRowCursor + { + private bool _disposed; + private long _position; + private readonly DataFrame _dataFrame; + private readonly List _getters; + private Dictionary _columnIndexToGetterIndex; + + public RowCursor(DataFrame dataFrame, bool[] activeColumns) + { + Debug.Assert(dataFrame != null); + Debug.Assert(activeColumns != null); + + _columnIndexToGetterIndex = new Dictionary(); + _position = -1; + _dataFrame = dataFrame; + _getters = new List(); + for (int i = 0; i < Schema.Count; i++) + { + if (!activeColumns[i]) + { + continue; + } + + Delegate getter = CreateGetterDelegate(i); + _getters.Add(getter); + Debug.Assert(getter != null); + _columnIndexToGetterIndex[i] = _getters.Count - 1; + } + } + + public override long Position => _position; + public override long Batch => 0; + public override DataViewSchema Schema => _dataFrame.DataViewSchema; + + protected override void Dispose(bool disposing) + { + if (_disposed) + { + return; + } + + if (disposing) + { + _position = -1; + } + + _disposed = true; + base.Dispose(disposing); + } + + private Delegate CreateGetterDelegate(int col) + { + DataFrameColumn column = _dataFrame.Columns[col]; + return column.GetDataViewGetter(this); + } + + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + if (!IsColumnActive(column)) + throw new ArgumentOutOfRangeException(nameof(column)); + + return (ValueGetter)_getters[_columnIndexToGetterIndex[column.Index]]; + } + + public override ValueGetter GetIdGetter() + { + return (ref DataViewRowId value) => value = new DataViewRowId((ulong)_position, 0); + } + + public override 
bool IsColumnActive(DataViewSchema.Column column) + { + return _getters[_columnIndexToGetterIndex[column.Index]] != null; + } + + public override bool MoveNext() + { + if (_disposed) + { + return false; + } + _position++; + return _position < _dataFrame.Rows.Count; + } + } } } diff --git a/src/Microsoft.Data.Analysis/RowCursor.cs b/src/Microsoft.Data.Analysis/RowCursor.cs deleted file mode 100644 index feb3e7ec9f..0000000000 --- a/src/Microsoft.Data.Analysis/RowCursor.cs +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Diagnostics; -using Microsoft.ML; -using Microsoft.ML.Data; - -namespace Microsoft.Data.Analysis -{ - internal sealed class RowCursor : DataViewRowCursor - { - private bool _disposed; - private long _position; - private readonly DataFrame _dataFrame; - internal readonly List _getters; - private Dictionary _columnIndexToGetterIndex; - - public RowCursor(DataFrame dataFrame, bool[] activeColumns) - { - Debug.Assert(dataFrame != null); - Debug.Assert(activeColumns != null); - - _columnIndexToGetterIndex = new Dictionary(); - _position = -1; - _dataFrame = dataFrame; - _getters = new List(); - for (int i = 0; i < Schema.Count; i++) - { - if (!activeColumns[i]) - { - continue; - } - - Delegate getter = CreateGetterDelegate(i); - _getters.Add(getter); - Debug.Assert(getter != null); - _columnIndexToGetterIndex[i] = _getters.Count - 1; - } - } - - public override long Position => _position; - public override long Batch => 0; - public override DataViewSchema Schema => _dataFrame.DataViewSchema; - - protected override void Dispose(bool disposing) - { - if (_disposed) - { - return; - } - - if (disposing) - { - _position = -1; - } - - _disposed = true; - base.Dispose(disposing); - } - - private Delegate 
CreateGetterDelegate(int col) - { - DataFrameColumn column = _dataFrame.Columns[col]; - return column.GetDataViewGetter(this); - } - - public override ValueGetter GetGetter(DataViewSchema.Column column) - { - if (!IsColumnActive(column)) - throw new ArgumentOutOfRangeException(nameof(column)); - - return (ValueGetter)_getters[_columnIndexToGetterIndex[column.Index]]; - } - - public override ValueGetter GetIdGetter() - { - return (ref DataViewRowId value) => value = new DataViewRowId((ulong)_position, 0); - } - - public override bool IsColumnActive(DataViewSchema.Column column) - { - return _getters[_columnIndexToGetterIndex[column.Index]] != null; - } - - public override bool MoveNext() - { - if (_disposed) - { - return false; - } - _position++; - return _position < _dataFrame.Rows.Count; - } - } -} From 120349523c70cd666ed72c60d333018fb93861ad Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 19 Mar 2021 11:04:49 -0700 Subject: [PATCH 9/9] Remove unused param Docs maxRows More unit tests Fixed ArrowStringDataFrameColumn construction in the unit test --- .../DataFrame.IDataView.cs | 29 +++----- .../DataFrameColumn.cs | 5 +- .../IDataView.Extension.cs | 72 ++++++++++++------- .../PrimitiveDataFrameColumn.cs | 2 +- .../StringDataFrameColumn.cs | 3 +- .../DataFrameIDataViewTests.cs | 30 ++++++-- .../DataFrameTests.cs | 10 +-- 7 files changed, 87 insertions(+), 64 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs index 79d5c693fc..27de92da69 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IDataView.cs @@ -16,7 +16,7 @@ public partial class DataFrame : IDataView bool IDataView.CanShuffle => false; private DataViewSchema _schema; - internal DataViewSchema DataViewSchema + private DataViewSchema DataViewSchema { get { @@ -70,29 +70,22 @@ private sealed class RowCursor : DataViewRowCursor private bool _disposed; private 
long _position; private readonly DataFrame _dataFrame; - private readonly List _getters; - private Dictionary _columnIndexToGetterIndex; + private readonly Delegate[] _getters; public RowCursor(DataFrame dataFrame, bool[] activeColumns) { Debug.Assert(dataFrame != null); Debug.Assert(activeColumns != null); - _columnIndexToGetterIndex = new Dictionary(); _position = -1; _dataFrame = dataFrame; - _getters = new List(); - for (int i = 0; i < Schema.Count; i++) + _getters = new Delegate[Schema.Count]; + for (int i = 0; i < _getters.Length; i++) { if (!activeColumns[i]) - { continue; - } - - Delegate getter = CreateGetterDelegate(i); - _getters.Add(getter); - Debug.Assert(getter != null); - _columnIndexToGetterIndex[i] = _getters.Count - 1; + _getters[i] = CreateGetterDelegate(i); + Debug.Assert(_getters[i] != null); } } @@ -103,15 +96,11 @@ public RowCursor(DataFrame dataFrame, bool[] activeColumns) protected override void Dispose(bool disposing) { if (_disposed) - { return; - } - if (disposing) { _position = -1; } - _disposed = true; base.Dispose(disposing); } @@ -127,7 +116,7 @@ public override ValueGetter GetGetter(DataViewSchema.Column colu if (!IsColumnActive(column)) throw new ArgumentOutOfRangeException(nameof(column)); - return (ValueGetter)_getters[_columnIndexToGetterIndex[column.Index]]; + return (ValueGetter)_getters[column.Index]; } public override ValueGetter GetIdGetter() @@ -137,15 +126,13 @@ public override ValueGetter GetIdGetter() public override bool IsColumnActive(DataViewSchema.Column column) { - return _getters[_columnIndexToGetterIndex[column.Index]] != null; + return _getters[column.Index] != null; } public override bool MoveNext() { if (_disposed) - { return false; - } _position++; return _position < _dataFrame.Rows.Count; } diff --git a/src/Microsoft.Data.Analysis/DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumn.cs index 346dd4f242..bd21d6fe96 100644 --- a/src/Microsoft.Data.Analysis/DataFrameColumn.cs +++ 
b/src/Microsoft.Data.Analysis/DataFrameColumn.cs @@ -251,15 +251,14 @@ public virtual DataFrameColumn Sort(bool ascending = true) /// Appends a value to this using /// /// The row cursor which has the current position - /// The in /// The cached ValueGetter for this column. - protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter) => throw new NotImplementedException(); + protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, Delegate ValueGetter) => throw new NotImplementedException(); /// /// Returns the ValueGetter for each active column in as a delegate to be cached. /// /// The row cursor which has the current position - /// The in + /// The to return the ValueGetter for. protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) => throw new NotImplementedException(); /// diff --git a/src/Microsoft.Data.Analysis/IDataView.Extension.cs b/src/Microsoft.Data.Analysis/IDataView.Extension.cs index 5b9c9034bc..32b97d365a 100644 --- a/src/Microsoft.Data.Analysis/IDataView.Extension.cs +++ b/src/Microsoft.Data.Analysis/IDataView.Extension.cs @@ -13,20 +13,40 @@ public static class IDataViewExtensions { private const int defaultMaxRows = 100; + /// + /// Returns a from this . + /// + /// The current . + /// The max number of rows in the . Defaults to 100. Use -1 to construct a DataFrame using all the rows in . + /// A with . public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows) { return ToDataFrame(dataView, maxRows, null); } + /// + /// Returns a with the first 100 rows of this . + /// + /// The current . + /// The columns selected for the resultant DataFrame + /// A with the selected columns and 100 rows.
public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns) { return ToDataFrame(dataView, defaultMaxRows, selectColumns); } + /// + /// Returns a with the first of this . + /// + /// The current . + /// The max number of rows in the . Use -1 to construct a DataFrame using all the rows in . + /// The columns selected for the resultant DataFrame + /// A with the selected columns and rows. public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns) { DataViewSchema schema = dataView.Schema; - List columns = new List(schema.Count); + List dataFrameColumns = new List(schema.Count); + maxRows = maxRows == -1 ? long.MaxValue : maxRows; HashSet selectColumnsSet = null; if (selectColumns != null && selectColumns.Length > 0) @@ -34,63 +54,63 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param selectColumnsSet = new HashSet(selectColumns); } - List activeColumns = new List(); - foreach (DataViewSchema.Column column in schema) + List activeDataViewColumns = new List(); + foreach (DataViewSchema.Column dataViewColumn in schema) { - if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name))) + if (dataViewColumn.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(dataViewColumn.Name))) { continue; } - activeColumns.Add(column); - DataViewType type = column.Type; + activeDataViewColumns.Add(dataViewColumn); + DataViewType type = dataViewColumn.Type; if (type == BooleanDataViewType.Instance) { - columns.Add(new BooleanDataFrameColumn(column.Name)); + dataFrameColumns.Add(new BooleanDataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.Byte) { - columns.Add(new ByteDataFrameColumn(column.Name)); + dataFrameColumns.Add(new ByteDataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.Double) { - columns.Add(new DoubleDataFrameColumn(column.Name)); + dataFrameColumns.Add(new
DoubleDataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.Single) { - columns.Add(new SingleDataFrameColumn(column.Name)); + dataFrameColumns.Add(new SingleDataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.Int32) { - columns.Add(new Int32DataFrameColumn(column.Name)); + dataFrameColumns.Add(new Int32DataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.Int64) { - columns.Add(new Int64DataFrameColumn(column.Name)); + dataFrameColumns.Add(new Int64DataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.SByte) { - columns.Add(new SByteDataFrameColumn(column.Name)); + dataFrameColumns.Add(new SByteDataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.Int16) { - columns.Add(new Int16DataFrameColumn(column.Name)); + dataFrameColumns.Add(new Int16DataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.UInt32) { - columns.Add(new UInt32DataFrameColumn(column.Name)); + dataFrameColumns.Add(new UInt32DataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.UInt64) { - columns.Add(new UInt64DataFrameColumn(column.Name)); + dataFrameColumns.Add(new UInt64DataFrameColumn(dataViewColumn.Name)); } else if (type == NumberDataViewType.UInt16) { - columns.Add(new UInt16DataFrameColumn(column.Name)); + dataFrameColumns.Add(new UInt16DataFrameColumn(dataViewColumn.Name)); } else if (type == TextDataViewType.Instance) { - columns.Add(new StringDataFrameColumn(column.Name)); + dataFrameColumns.Add(new StringDataFrameColumn(dataViewColumn.Name)); } else { @@ -98,28 +118,26 @@ public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, param } } - using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns)) + using (DataViewRowCursor cursor = dataView.GetRowCursor(activeDataViewColumns)) { - Delegate[] activeColumnDelegates = new Delegate[activeColumns.Count]; + Delegate[] activeColumnDelegates = new 
Delegate[activeDataViewColumns.Count]; int columnIndex = 0; - foreach (DataViewSchema.Column column in activeColumns) + foreach (DataViewSchema.Column activeDataViewColumn in activeDataViewColumns) { - Delegate valueGetter = columns[columnIndex].GetValueGetterUsingCursor(cursor, column); + Delegate valueGetter = dataFrameColumns[columnIndex].GetValueGetterUsingCursor(cursor, activeDataViewColumn); activeColumnDelegates[columnIndex] = valueGetter; columnIndex++; } while (cursor.MoveNext() && cursor.Position < maxRows) { - columnIndex = 0; - foreach (DataViewSchema.Column column in activeColumns) + for (int i = 0; i < activeColumnDelegates.Length; i++) { - columns[columnIndex].AddValueUsingCursor(cursor, column, activeColumnDelegates[columnIndex]); - columnIndex++; + dataFrameColumns[i].AddValueUsingCursor(cursor, activeColumnDelegates[i]); } } } - return new DataFrame(columns); + return new DataFrame(dataFrameColumns); } } diff --git a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs index f91c72802c..a7e7d20cb9 100644 --- a/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs @@ -776,7 +776,7 @@ private static ValueGetter CreateCharValueGetterDelegate(DataViewRowCurs private static ValueGetter CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn column) => (ref double value) => value = (double?)column[cursor.Position] ?? 
double.NaN; - protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter) + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter) { long row = cursor.Position; T value = default; diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs index 197cce721d..7ada30e10c 100644 --- a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs +++ b/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs @@ -468,7 +468,7 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor) private ValueGetter> CreateValueGetterDelegate(DataViewRowCursor cursor) => (ref ReadOnlyMemory value) => value = this[cursor.Position].AsMemory(); - protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter) + protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter) { long row = cursor.Position; ReadOnlyMemory value = default; @@ -489,6 +489,7 @@ protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, D throw new IndexOutOfRangeException(nameof(row)); } } + protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) { return cursor.GetGetter>(schemaColumn); diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs index c090817cf5..dea8099876 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameIDataViewTests.cs @@ -252,25 +252,41 @@ public void TestDataFrameFromIDataView_SelectColumns() Assert.True(df.Columns["Double"].ElementwiseEquals(newDf.Columns["Double"]).All()); } - [Fact] - public void TestDataFrameFromIDataView_SelectRows() + [Theory] + [InlineData(10, 5)] + 
[InlineData(110, 100)] + [InlineData(110, -1)] + public void TestDataFrameFromIDataView_SelectRows(int dataFrameSize, int rowSize) { - DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(10, withNulls: false); + DataFrame df = DataFrameTests.MakeDataFrameWithAllColumnTypes(dataFrameSize, withNulls: false); df.Columns.Remove("Char"); // Because chars are returned as uint16 by DataViewSchema, so end up comparing CharDataFrameColumn to UInt16DataFrameColumn and fail asserts df.Columns.Remove("Decimal"); // Because decimal is returned as double by DataViewSchema, so end up comparing DecimalDataFrameColumn to DoubleDataFrameColumn and fail asserts IDataView dfAsIDataView = df; - DataFrame newDf = dfAsIDataView.ToDataFrame(5); - Assert.Equal(5, newDf.Rows.Count); + DataFrame newDf; + if (rowSize == 100) + { + // Test default + newDf = dfAsIDataView.ToDataFrame(); + } + else + { + newDf = dfAsIDataView.ToDataFrame(rowSize); + } + if (rowSize == -1) + { + rowSize = dataFrameSize; + } + Assert.Equal(rowSize, newDf.Rows.Count); Assert.Equal(df.Columns.Count, newDf.Columns.Count); for (int i = 0; i < newDf.Columns.Count; i++) { - Assert.Equal(5, newDf.Columns[i].Length); + Assert.Equal(rowSize, newDf.Columns[i].Length); Assert.Equal(df.Columns[i].Name, newDf.Columns[i].Name); } Assert.Equal(dfAsIDataView.Schema.Count, newDf.Columns.Count); for (int c = 0; c < df.Columns.Count; c++) { - for (int r = 0; r < 5; r++) + for (int r = 0; r < rowSize; r++) { Assert.Equal(df.Columns[c][r], newDf.Columns[c][r]); } diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index c277aae36e..300babbffb 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -62,10 +62,12 @@ public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, boo // write the current length to (index + 1) int offsetIndex = (i + 1) * 4; - 
offsetMemory[offsetIndex++] = (byte)(3 * validStringsIndex); - offsetMemory[offsetIndex++] = 0; - offsetMemory[offsetIndex++] = 0; - offsetMemory[offsetIndex++] = 0; + int offsetValue = 3 * validStringsIndex; + byte[] offsetValueBytes = BitConverter.GetBytes(offsetValue); + offsetMemory[offsetIndex++] = offsetValueBytes[0]; + offsetMemory[offsetIndex++] = offsetValueBytes[1]; + offsetMemory[offsetIndex++] = offsetValueBytes[2]; + offsetMemory[offsetIndex++] = offsetValueBytes[3]; } int nullCount = withNulls ? 1 : 0;