Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IDataView to DataFrame #5712

Merged
merged 9 commits into from
Mar 22, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
namespace Microsoft.Data.Analysis
{
public partial class DataFrame : IDataView
{
{
// TODO: support shuffling
bool IDataView.CanShuffle => false;

private DataViewSchema _schema;
private DataViewSchema DataViewSchema
internal DataViewSchema DataViewSchema
{
get
{
Expand Down Expand Up @@ -53,6 +53,7 @@ private DataViewRowCursor GetRowCursorCore(IEnumerable<DataViewSchema.Column> co

return new RowCursor(this, activeColumns);
}

DataViewRowCursor IDataView.GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand)
{
return GetRowCursorCore(columnsNeeded);
Expand All @@ -69,22 +70,29 @@ private sealed class RowCursor : DataViewRowCursor
private bool _disposed;
private long _position;
private readonly DataFrame _dataFrame;
private readonly Delegate[] _getters;
private readonly List<Delegate> _getters;
private Dictionary<int, int> _columnIndexToGetterIndex;

/// <summary>
/// Creates a cursor over <paramref name="dataFrame"/> and builds a getter for every active column.
/// </summary>
/// <param name="dataFrame">The frame whose rows this cursor enumerates.</param>
/// <param name="activeColumns">
/// Flags indexed by schema column position; a getter is created only where the flag is true.
/// </param>
public RowCursor(DataFrame dataFrame, bool[] activeColumns)
{
    Debug.Assert(dataFrame != null);
    Debug.Assert(activeColumns != null);

    _columnIndexToGetterIndex = new Dictionary<int, int>();
    _position = -1;
    _dataFrame = dataFrame;
    _getters = new List<Delegate>();
    for (int i = 0; i < Schema.Count; i++)
    {
        if (!activeColumns[i])
        {
            continue;
        }

        Delegate getter = CreateGetterDelegate(i);
        Debug.Assert(getter != null);

        // Getters are stored densely (inactive columns take no slot), so record
        // which list slot belongs to schema column i before appending.
        _columnIndexToGetterIndex[i] = _getters.Count;
        _getters.Add(getter);
    }
}

Expand All @@ -95,11 +103,15 @@ public RowCursor(DataFrame dataFrame, bool[] activeColumns)
/// <summary>
/// Marks this cursor as disposed. Calls made after the first one do nothing.
/// </summary>
/// <param name="disposing">
/// When true, the position is reset to -1 before the cursor is flagged as disposed.
/// </param>
protected override void Dispose(bool disposing)
{
    if (!_disposed)
    {
        if (disposing)
        {
            _position = -1;
        }

        _disposed = true;
        base.Dispose(disposing);
    }
}
Expand All @@ -115,7 +127,7 @@ public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column colu
if (!IsColumnActive(column))
throw new ArgumentOutOfRangeException(nameof(column));

return (ValueGetter<TValue>)_getters[column.Index];
return (ValueGetter<TValue>)_getters[_columnIndexToGetterIndex[column.Index]];
}

public override ValueGetter<DataViewRowId> GetIdGetter()
Expand All @@ -125,13 +137,15 @@ public override ValueGetter<DataViewRowId> GetIdGetter()

/// <summary>
/// Reports whether this cursor holds a getter for <paramref name="column"/>.
/// </summary>
/// <param name="column">The schema column to check.</param>
/// <returns>true when the column's index is registered in the getter index map; otherwise false.</returns>
public override bool IsColumnActive(DataViewSchema.Column column)
{
    // Use a membership test rather than the dictionary indexer: the indexer throws
    // KeyNotFoundException for a column that has no getter, whereas callers expect false.
    return _columnIndexToGetterIndex.ContainsKey(column.Index);
}

/// <summary>
/// Advances the cursor by one row.
/// </summary>
/// <returns>
/// false when the cursor has been disposed or the new position is not below the frame's row count;
/// otherwise true.
/// </returns>
public override bool MoveNext()
{
    // Short-circuiting keeps a disposed cursor from advancing its position.
    return !_disposed && ++_position < _dataFrame.Rows.Count;
}
Expand Down
15 changes: 15 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,21 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// </param>
protected internal virtual void AddDataViewColumn(DataViewSchema.Builder builder) => throw new NotImplementedException();

/// <summary>
/// Adds the value found at the current position of <paramref name="cursor"/> to this <see cref="DataFrameColumn"/>.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <param name="ValueGetter">The cached ValueGetter for this column.</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate ValueGetter)
{
    throw new NotImplementedException();
}

/// <summary>
/// Obtains from <paramref name="cursor"/> the value getter for <paramref name="schemaColumn"/>,
/// returned as an untyped <see cref="Delegate"/> so that it can be cached.
/// </summary>
/// <param name="cursor">The row cursor which has the current position</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> in <see cref="DataViewSchema"/></param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    throw new NotImplementedException();
}

/// <summary>
/// Clamps values beyond the specified thresholds
/// </summary>
Expand Down
126 changes: 126 additions & 0 deletions src/Microsoft.Data.Analysis/IDataView.Extension.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Data.Analysis;
using Microsoft.ML.Data;

namespace Microsoft.ML
{
/// <summary>
/// Extension methods that copy the contents of an <see cref="IDataView"/> into a <see cref="DataFrame"/>.
/// </summary>
public static class IDataViewExtensions
{
    private const int DefaultMaxRows = 100;

    /// <summary>
    /// Copies every non-hidden column of <paramref name="dataView"/> into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read.</param>
    /// <param name="maxRows">Upper bound on the number of rows copied.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = DefaultMaxRows)
    {
        return ToDataFrame(dataView, maxRows, null);
    }

    /// <summary>
    /// Copies the named columns of <paramref name="dataView"/> into a new <see cref="DataFrame"/>,
    /// using the default row limit.
    /// </summary>
    /// <param name="dataView">The data view to read.</param>
    /// <param name="selectColumns">Names of the columns to copy; when empty, every non-hidden column is copied.</param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
    {
        return ToDataFrame(dataView, DefaultMaxRows, selectColumns);
    }

    /// <summary>
    /// Copies the named columns of <paramref name="dataView"/> into a new <see cref="DataFrame"/>.
    /// </summary>
    /// <param name="dataView">The data view to read.</param>
    /// <param name="maxRows">Upper bound on the number of rows copied.</param>
    /// <param name="selectColumns">
    /// Names of the columns to copy; when null or empty, every non-hidden column is copied.
    /// Names that match no non-hidden schema column are ignored.
    /// </param>
    /// <returns>A new <see cref="DataFrame"/> holding the copied rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A selected column has a type with no matching DataFrame column.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
    {
        if (dataView == null)
        {
            throw new ArgumentNullException(nameof(dataView));
        }

        DataViewSchema schema = dataView.Schema;
        List<DataFrameColumn> columns = new List<DataFrameColumn>(schema.Count);

        HashSet<string> selectColumnsSet = null;
        if (selectColumns != null && selectColumns.Length > 0)
        {
            selectColumnsSet = new HashSet<string>(selectColumns);
        }

        List<DataViewSchema.Column> activeColumns = new List<DataViewSchema.Column>();
        foreach (DataViewSchema.Column column in schema)
        {
            if (column.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(column.Name)))
            {
                continue;
            }

            activeColumns.Add(column);
            columns.Add(CreateColumn(column));
        }

        using (DataViewRowCursor cursor = dataView.GetRowCursor(activeColumns))
        {
            // Resolve each getter once up front; the row loop below reuses them.
            Delegate[] activeColumnDelegates = new Delegate[activeColumns.Count];
            for (int i = 0; i < activeColumns.Count; i++)
            {
                activeColumnDelegates[i] = columns[i].GetValueGetterUsingCursor(cursor, activeColumns[i]);
            }

            while (cursor.MoveNext() && cursor.Position < maxRows)
            {
                for (int i = 0; i < activeColumns.Count; i++)
                {
                    columns[i].AddValueUsingCursor(cursor, activeColumns[i], activeColumnDelegates[i]);
                }
            }
        }

        return new DataFrame(columns);
    }

    // Creates the empty DataFrame column that matches the data view type of the given schema column.
    private static DataFrameColumn CreateColumn(DataViewSchema.Column column)
    {
        DataViewType type = column.Type;
        if (type == BooleanDataViewType.Instance)
        {
            return new BooleanDataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.Byte)
        {
            return new ByteDataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.Double)
        {
            return new DoubleDataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.Single)
        {
            return new SingleDataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.Int32)
        {
            return new Int32DataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.Int64)
        {
            return new Int64DataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.SByte)
        {
            return new SByteDataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.Int16)
        {
            return new Int16DataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.UInt32)
        {
            return new UInt32DataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.UInt64)
        {
            return new UInt64DataFrameColumn(column.Name);
        }
        else if (type == NumberDataViewType.UInt16)
        {
            return new UInt16DataFrameColumn(column.Name);
        }
        else if (type == TextDataViewType.Instance)
        {
            return new StringDataFrameColumn(column.Name);
        }

        // NOTE(review): vector-typed IDataView columns are expected to land here until
        // DataFrame gains vector column support (raised during review of this change).
        throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
    }
}

}
26 changes: 26 additions & 0 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -775,5 +775,31 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs

// Builds a getter that reads the decimal cell at the cursor's position as a double,
// yielding double.NaN when the converted cell is null.
private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column)
{
    return (ref double value) =>
    {
        value = (double?)column[cursor.Position] ?? double.NaN;
    };
}

/// <summary>
/// Reads the value at the current position of <paramref name="cursor"/> through
/// <paramref name="getter"/> and stores it in this column at that row.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="column">The schema column being read; not used by this implementation.</param>
/// <param name="getter">The cached getter for this column; must be a <see cref="ValueGetter{T}"/>.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not a <see cref="ValueGetter{T}"/>.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is beyond the end of this column.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column column, Delegate getter)
{
    long row = cursor.Position;
    T value = default;
    Debug.Assert(getter != null, "Expected getter to be valid");

    // A direct cast reports a mismatched delegate type as InvalidCastException instead of the
    // NullReferenceException that invoking the result of a failed 'as' cast would produce.
    ValueGetter<T> typedGetter = (ValueGetter<T>)getter;
    typedGetter(ref value);

    if (Length > row)
    {
        // The row already exists: overwrite it.
        this[row] = value;
    }
    else if (Length == row)
    {
        // The row is the next one past the end: grow the column by one.
        Append(value);
    }
    else
    {
        // A gap would be left between the current end and the requested row.
        throw new IndexOutOfRangeException(nameof(row));
    }
}

/// <summary>
/// Asks <paramref name="cursor"/> for the getter of <paramref name="schemaColumn"/> typed to this column's element type.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The schema column whose values will be read.</param>
/// <returns>The getter returned by the cursor, as an untyped <see cref="Delegate"/>.</returns>
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    ValueGetter<T> typedGetter = cursor.GetGetter<T>(schemaColumn);
    return typedGetter;
}
}
}
26 changes: 26 additions & 0 deletions src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -467,5 +467,31 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)

// Builds a getter that exposes the string at the cursor's position as ReadOnlyMemory<char>.
private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor)
{
    return (ref ReadOnlyMemory<char> value) =>
    {
        value = this[cursor.Position].AsMemory();
    };
}

/// <summary>
/// Reads the text at the current position of <paramref name="cursor"/> through
/// <paramref name="getter"/> and stores it in this column at that row as a string.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="schemaColumn">The schema column being read; not used by this implementation.</param>
/// <param name="getter">The cached getter for this column; must be a ValueGetter of ReadOnlyMemory&lt;char&gt;.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not of the expected delegate type.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is beyond the end of this column.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn, Delegate getter)
{
    long row = cursor.Position;
    ReadOnlyMemory<char> value = default;
    Debug.Assert(getter != null, "Expected getter to be valid");

    // A direct cast reports a mismatched delegate type as InvalidCastException instead of the
    // NullReferenceException that invoking the result of a failed 'as' cast would produce.
    ValueGetter<ReadOnlyMemory<char>> typedGetter = (ValueGetter<ReadOnlyMemory<char>>)getter;
    typedGetter(ref value);

    if (Length > row)
    {
        // The row already exists: overwrite it.
        this[row] = value.ToString();
    }
    else if (Length == row)
    {
        // The row is the next one past the end: grow the column by one.
        Append(value.ToString());
    }
    else
    {
        // A gap would be left between the current end and the requested row.
        throw new IndexOutOfRangeException(nameof(row));
    }
}
/// <summary>
/// Asks <paramref name="cursor"/> for the text getter of <paramref name="schemaColumn"/>.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The schema column whose values will be read.</param>
/// <returns>The getter returned by the cursor, as an untyped <see cref="Delegate"/>.</returns>
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    ValueGetter<ReadOnlyMemory<char>> textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
    return textGetter;
}
}
}
9 changes: 9 additions & 0 deletions src/Microsoft.Data.Analysis/strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion src/Microsoft.Data.Analysis/strings.resx
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,13 @@
<data name="NonSeekableStream" xml:space="preserve">
<value>Expected a seekable stream</value>
</data>
<data name="NotSupportedColumnType" xml:space="preserve">
<value>{0} is not a supported column type.</value>
</data>
<data name="NumericColumnType" xml:space="preserve">
<value>numeric column</value>
</data>
<data name="SpansMultipleBuffers" xml:space="preserve">
<value>Cannot span multiple buffers</value>
</data>
</root>
Loading