Skip to content

Commit

Permalink
Bring ensembles into codebase (dotnet#379)
Browse files Browse the repository at this point in the history
Introduce Ensemble codebase
  • Loading branch information
Ivanidzo4ka authored and eerhardt committed Jul 27, 2018
1 parent 33c28a7 commit 05863c8
Show file tree
Hide file tree
Showing 73 changed files with 19,405 additions and 10,044 deletions.
7 changes: 7 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Sweeper.Tests"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.LightGBM", "src\Microsoft.ML.LightGBM\Microsoft.ML.LightGBM.csproj", "{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Ensemble", "src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj", "{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -228,6 +230,10 @@ Global
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}.Debug|Any CPU.Build.0 = Debug|Any CPU
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}.Release|Any CPU.ActiveCfg = Release|Any CPU
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25}.Release|Any CPU.Build.0 = Release|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -267,6 +273,7 @@ Global
{9252A8EB-ABFB-440C-AB4D-1D562753CE0F} = {487213C9-E8A9-4F94-85D7-28A05DBBFE3A}
{3DEB504D-7A07-48CE-91A2-8047461CB3D4} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{001F3B4E-FBE4-4001-AFD2-A6A989CD1C25} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{DCF46B79-1FDB-4DBA-A263-D3D64E3AAA27} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
Expand Down
22 changes: 22 additions & 0 deletions src/Microsoft.ML.Ensemble/Batch.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Data;

namespace Microsoft.ML.Runtime.Ensemble
{
/// <summary>
/// Immutable pair of data sets: one used for training and one used for testing.
/// </summary>
public sealed class Batch
{
    /// <summary>The data supplied to the constructor as <c>trainData</c>.</summary>
    public readonly RoleMappedData TrainInstances;

    /// <summary>The data supplied to the constructor as <c>testData</c>.</summary>
    public readonly RoleMappedData TestInstances;

    /// <summary>
    /// Builds a batch out of a training set and a testing set.
    /// </summary>
    /// <param name="trainData">Stored in <see cref="TrainInstances"/>.</param>
    /// <param name="testData">Stored in <see cref="TestInstances"/>.</param>
    public Batch(RoleMappedData trainData, RoleMappedData testData)
    {
        // Both arguments go through Contracts.CheckValue (presumably rejecting null)
        // before either field is written.
        Contracts.CheckValue(trainData, nameof(trainData));
        Contracts.CheckValue(testData, nameof(testData));

        TestInstances = testData;
        TrainInstances = trainData;
    }
}
}
114 changes: 114 additions & 0 deletions src/Microsoft.ML.Ensemble/EnsembleUtils.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Internal.Utilities;

namespace Microsoft.ML.Runtime.Ensemble
{
/// <summary>
/// Helpers for restricting a feature vector to a subset of its slots.
/// </summary>
internal static class EnsembleUtils
{
    /// <summary>
    /// Produces data whose feature column keeps only the slots flagged in
    /// <paramref name="features"/>; every other slot is zeroed out.
    /// </summary>
    /// <param name="host">Host handed to the column mapper that performs the per-row selection.</param>
    /// <param name="data">Input data; asserted to expose a feature column.</param>
    /// <param name="features">One bit per feature slot; asserted to be as long as the feature vector.</param>
    /// <returns>
    /// <paramref name="data"/> itself when every bit is set; otherwise a new
    /// <see cref="RoleMappedData"/> created over the mapped view with the column role names
    /// taken from the input schema.
    /// </returns>
    public static RoleMappedData SelectFeatures(IHost host, RoleMappedData data, BitArray features)
    {
        Contracts.AssertValue(host);
        Contracts.AssertValue(data);
        Contracts.Assert(data.Schema.Feature != null);
        Contracts.AssertValue(features);

        var featureColumn = data.Schema.Feature;
        var featureType = featureColumn.Type;
        Contracts.Assert(features.Length == featureType.VectorSize);

        // All slots selected: there is nothing to mask, hand the input back untouched.
        int selectedCount = Utils.GetCardinality(features);
        if (selectedCount == featureType.VectorSize)
            return data;

        // REVIEW: This doesn't preserve metadata on the features column. Should it?
        // The feature column's name and type are each passed twice (presumably as source and
        // destination), so the mapped column takes the place of the original one.
        var featureName = featureColumn.Name;
        var maskedView = LambdaColumnMapper.Create(
            host, "FeatureSelector", data.Data, featureName, featureName, featureType, featureType,
            (ref VBuffer<Single> src, ref VBuffer<Single> dst) => SelectFeatures(ref src, features, selectedCount, ref dst));

        return RoleMappedData.Create(maskedView, data.Schema.GetColumnRoleNames());
    }

    /// <summary>
    /// Copies into <paramref name="dst"/> the entries of <paramref name="src"/> whose index is set in
    /// <paramref name="includedIndices"/>; entries at all other indices become default(T).
    /// <paramref name="dst"/> is constructed with the same length as <paramref name="src"/>, and its
    /// existing value/index arrays are reused whenever they are large enough.
    /// </summary>
    /// <param name="src">Source vector, dense or sparse.</param>
    /// <param name="includedIndices">Selection mask; asserted to have exactly src.Length bits.</param>
    /// <param name="cardinality">
    /// Number of set bits in <paramref name="includedIndices"/>; asserted to match the mask and to be
    /// strictly less than src.Length.
    /// </param>
    /// <param name="dst">Receives the result; its previous buffers may be recycled.</param>
    public static void SelectFeatures<T>(ref VBuffer<T> src, BitArray includedIndices, int cardinality, ref VBuffer<T> dst)
    {
        Contracts.Assert(Utils.Size(includedIndices) == src.Length);
        Contracts.Assert(cardinality == Utils.GetCardinality(includedIndices));
        Contracts.Assert(cardinality < src.Length);

        // Start from dst's current arrays so they can be recycled.
        var outValues = dst.Values;
        var outIndices = dst.Indices;
        int length = src.Length;

        if (!src.IsDense)
        {
            // Sparse input: walk only the explicitly stored entries and keep the selected ones.
            int valuesCapacity = Utils.Size(outValues);
            int indicesCapacity = Utils.Size(outIndices);

            // Reallocation is only considered when a buffer could not hold every stored entry,
            // and even then only for a buffer that is smaller than the number of selected slots.
            bool mayOverflow = valuesCapacity < src.Count || indicesCapacity < src.Count;
            if (mayOverflow && valuesCapacity < cardinality)
                outValues = new T[cardinality];
            if (mayOverflow && indicesCapacity < cardinality)
                outIndices = new int[cardinality];

            int kept = 0;
            for (int k = 0; k < src.Count; k++)
            {
                int slot = src.Indices[k];
                if (!includedIndices[slot])
                    continue;

                outValues[kept] = src.Values[k];
                outIndices[kept] = slot;
                kept++;
            }

            dst = new VBuffer<T>(length, kept, outValues, outIndices);
            return;
        }

        if (cardinality >= length / 2)
        {
            // Dense input with at least half of the slots selected: write one value per slot and
            // build the result with the count-less constructor (the dense form).
            if (Utils.Size(outValues) < length)
                outValues = new T[length];

            for (int slot = 0; slot < length; slot++)
            {
                if (includedIndices[slot])
                    outValues[slot] = src.Values[slot];
                else
                    outValues[slot] = default(T);
            }

            dst = new VBuffer<T>(length, outValues, outIndices);
            return;
        }

        // Dense input with fewer than half of the slots selected: store only the selected slots
        // together with their indices.
        if (Utils.Size(outValues) < cardinality)
            outValues = new T[cardinality];
        if (Utils.Size(outIndices) < cardinality)
            outIndices = new int[cardinality];

        int taken = 0;
        for (int slot = 0; slot < length; slot++)
        {
            if (!includedIndices[slot])
                continue;

            Contracts.Assert(taken < cardinality);
            outValues[taken] = src.Values[slot];
            outIndices[taken] = slot;
            taken++;
        }

        Contracts.Assert(taken == cardinality);
        dst = new VBuffer<T>(length, taken, outValues, outIndices);
    }
}
}
Loading

0 comments on commit 05863c8

Please sign in to comment.