diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 78447515e1..99578ed064 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -26,6 +26,7 @@ Please feel free to search this page and use any code that suits your needs. ### List of recipes - [How do I load data from a text file?](#how-do-i-load-data-from-a-text-file) +- [How can I read and write binary data?](#how-can-i-read-and-write-binary-data) - [How do I load data with many columns from a CSV?](#how-do-i-load-data-with-many-columns-from-a-csv) - [How do I debug my experiment or preview my pipeline?](#how-do-i-debug-my-experiment-or-preview-my-pipeline) - [How do I look at the intermediate data?](#how-do-i-look-at-the-intermediate-data) @@ -141,6 +142,52 @@ var data = mlContext.Data.LoadFromTextFile(dataPath, ``` +## How can I read and write binary data? +In addition to text files, ML.NET allows you to read and write binary data. This has a few advantages: you do not have to specify a schema, reading can be faster, and binary files are generally smaller than text files. + +To write binary data you first need some data to save. Specifically, you need an instance of an `IDataView`. Below is a code snippet that uses the iris data as an example. + +```csharp +// Data model for the iris data +public class IrisData +{ + public float Label { get; set; } + public float SepalLength { get; set; } + public float SepalWidth { get; set; } + public float PetalLength { get; set; } + public float PetalWidth { get; set; } +} + +// An array of iris data points +var dataArray = new[] +{ + new IrisData { Label=1, PetalLength=1, SepalLength=1, PetalWidth=1, SepalWidth=1 }, + new IrisData { Label=0, PetalLength=2, SepalLength=2, PetalWidth=2, SepalWidth=2 } +}; + +// Create the ML.NET context. +var context = new MLContext(); + +// Create the data view from an IEnumerable. +// This method will use the definition of IrisData to understand what columns there are +// in the data view. 
However, the objects in ML.NET are only "promises" of data since +// ML.NET operations are lazy. One way to get a look at the data is with Schema Comprehension. +// Refer to this document for more information - https://github.com/dotnet/machinelearning/blob/master/docs/code/SchemaComprehension.md +var data = context.Data.LoadFromEnumerable(dataArray); + +// Use a FileStream to create a file. Use the stream and the data view in the "SaveAsBinary" method. +using(var stream = new FileStream("./iris.idv", FileMode.Create)) +{ + context.Data.SaveAsBinary(data, stream); +} +``` + +To read a binary file, simply use the `context.Data.LoadFromBinary` method and pass in the path of the binary file to read in. Notice that the schema of the data does not need to be defined here. + +```csharp +var data = context.Data.LoadFromBinary("./iris.idv"); +``` + ## How do I load data from multiple files? You can again use the `TextLoader`, and specify an array of files to its Load method. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndReadFromBinary.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndReadFromBinary.cs new file mode 100644 index 0000000000..4770881015 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndReadFromBinary.cs @@ -0,0 +1,30 @@ +using System.Collections.Generic; +using System.IO; +using Microsoft.ML.SamplesUtils; + +namespace Microsoft.ML.Samples.Dynamic.DataOperations +{ + public class SaveAndLoadFromBinary + { + public static void Example() + { + MLContext mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable enumerableOfData = DatasetUtils.GetSampleTemperatureData(5); + + // Load dataset into an IDataView. + IDataView data = mlContext.Data.LoadFromEnumerable(enumerableOfData); + + // Creating a FileStream object to create a file and use + // the stream to create a binary file. 
+ using (FileStream stream = new FileStream("./sample-temp-data.idv", FileMode.Create)) + { + mlContext.Data.SaveAsBinary(data, stream); + } + + // Load a binary file by file path. + IDataView binaryData = mlContext.Data.LoadFromBinary("./sample-temp-data.idv"); + } + } +} diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index 8f6dd9fb9b..2daa050064 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -6,6 +6,7 @@ + @@ -49,6 +50,7 @@ + diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index cb4ad1f999..dc7b09f393 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -501,6 +501,32 @@ public override Action GetMapping() } } + [Fact] + public void ReadAndWriteBinaryData() + { + // An array of iris data points + IrisInput[] dataArray = new[] + { + new IrisInput { IgnoredLabel="1", PetalLength=1, SepalLength=1, PetalWidth=1, SepalWidth=1 }, + new IrisInput { IgnoredLabel="0", PetalLength=2, SepalLength=2, PetalWidth=2, SepalWidth=2 } + }; + + // Create the ML.NET context. + MLContext context = new MLContext(); + + // Assume that we already have an IDataView, which could be the result of loading text data, + // or the result of some transformation. + IDataView data = context.Data.LoadFromEnumerable(dataArray); + + // Use a FileStream to create a file. Use the stream and the data view in the "SaveAsBinary" method. 
+ using (FileStream stream = new FileStream("./iris.idv", FileMode.Create)) + { + context.Data.SaveAsBinary(data, stream); + } + + DeleteOutputPath("iris.idv"); + } + private static void RunEndToEnd(MLContext mlContext, IDataView trainData, string modelPath) { // Construct the learning pipeline. Note that we are now providing a contract name for the custom mapping: