From a07091baeaaf447c08b5af14736c1db2a9146215 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Mon, 7 May 2018 14:21:44 -0700 Subject: [PATCH 01/25] Add PartitionedFileLoader --- .../Named/Year=2017/Month=01/data1.csv | 2 + .../Named/Year=2017/Month=01/data2.csv | 3 + .../Named/Year=2017/Month=01/dataEmpty.csv | 1 + .../Named/Year=2017/Month=02/data1.csv | 4 + .../Named/Year=2017/TestBadDir/data1.csv | 4 + Samples/Partitioned/Unnamed/2017/01/data1.csv | 2 + Samples/Partitioned/Unnamed/2017/01/data2.csv | 3 + .../Unnamed/2017/01/dataBadSchema.csv | 3 + Samples/Partitioned/Unnamed/2017/02/data1.csv | 4 + .../PartitionedNamedDirectories-Data.txt | 14 + .../PartitionedNamedDirectories-Schema.txt | 5 + .../PartitionedUnnamedDirectories-Data.txt | 16 + .../PartitionedUnnamedDirectories-Schema.txt | 5 + .../PartitionedNamedDirectories-Data.txt | 14 + .../PartitionedNamedDirectories-Schema.txt | 5 + .../PartitionedUnnamedDirectories-Data.txt | 16 + .../PartitionedUnnamedDirectories-Schema.txt | 5 + src/Microsoft.ML.Core/Utilities/PathUtils.cs | 66 +- .../DataLoadSave/PartitionedFileLoader.cs | 726 ++++++++++++++++++ .../DataLoadSave/PartitionedPathParser.cs | 385 ++++++++++ .../DataView/CompositeSchema.cs | 119 +++ src/Microsoft.ML.Data/DataView/ZipDataView.cs | 125 +-- .../PartitionedFileLoaderTests.cs | 54 ++ 23 files changed, 1457 insertions(+), 124 deletions(-) create mode 100644 Samples/Partitioned/Named/Year=2017/Month=01/data1.csv create mode 100644 Samples/Partitioned/Named/Year=2017/Month=01/data2.csv create mode 100644 Samples/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv create mode 100644 Samples/Partitioned/Named/Year=2017/Month=02/data1.csv create mode 100644 Samples/Partitioned/Named/Year=2017/TestBadDir/data1.csv create mode 100644 Samples/Partitioned/Unnamed/2017/01/data1.csv create mode 100644 Samples/Partitioned/Unnamed/2017/01/data2.csv create mode 100644 Samples/Partitioned/Unnamed/2017/01/dataBadSchema.csv create mode 100644 Samples/Partitioned/Unnamed/2017/02/data1.csv create mode 100644 ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt create mode 100644 ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt create mode 100644 ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt create mode 100644 ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt create mode 100644 ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Data.txt create mode 100644 ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Schema.txt create mode 100644 ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt create mode 100644 ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Schema.txt create mode 100644 src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs create mode 100644 src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs create mode 100644 src/Microsoft.ML.Data/DataView/CompositeSchema.cs create mode 100644 test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs diff --git a/Samples/Partitioned/Named/Year=2017/Month=01/data1.csv b/Samples/Partitioned/Named/Year=2017/Month=01/data1.csv new file mode 100644 index 0000000000..c69b170df5 --- /dev/null +++ b/Samples/Partitioned/Named/Year=2017/Month=01/data1.csv @@ -0,0 +1,2 @@ +col1, col2 +0, 1 diff --git a/Samples/Partitioned/Named/Year=2017/Month=01/data2.csv b/Samples/Partitioned/Named/Year=2017/Month=01/data2.csv new file mode 100644 index 0000000000..23cb4a7b19 --- 
/dev/null +++ b/Samples/Partitioned/Named/Year=2017/Month=01/data2.csv @@ -0,0 +1,3 @@ +col1, col2 +4, 5 +6, 7 \ No newline at end of file diff --git a/Samples/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv b/Samples/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv new file mode 100644 index 0000000000..d55d30cf89 --- /dev/null +++ b/Samples/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv @@ -0,0 +1 @@ +col1, col2 \ No newline at end of file diff --git a/Samples/Partitioned/Named/Year=2017/Month=02/data1.csv b/Samples/Partitioned/Named/Year=2017/Month=02/data1.csv new file mode 100644 index 0000000000..6d80a7f679 --- /dev/null +++ b/Samples/Partitioned/Named/Year=2017/Month=02/data1.csv @@ -0,0 +1,4 @@ +col1, col2 +21, 22 +23, 24 +25, 26 diff --git a/Samples/Partitioned/Named/Year=2017/TestBadDir/data1.csv b/Samples/Partitioned/Named/Year=2017/TestBadDir/data1.csv new file mode 100644 index 0000000000..6d80a7f679 --- /dev/null +++ b/Samples/Partitioned/Named/Year=2017/TestBadDir/data1.csv @@ -0,0 +1,4 @@ +col1, col2 +21, 22 +23, 24 +25, 26 diff --git a/Samples/Partitioned/Unnamed/2017/01/data1.csv b/Samples/Partitioned/Unnamed/2017/01/data1.csv new file mode 100644 index 0000000000..c69b170df5 --- /dev/null +++ b/Samples/Partitioned/Unnamed/2017/01/data1.csv @@ -0,0 +1,2 @@ +col1, col2 +0, 1 diff --git a/Samples/Partitioned/Unnamed/2017/01/data2.csv b/Samples/Partitioned/Unnamed/2017/01/data2.csv new file mode 100644 index 0000000000..23cb4a7b19 --- /dev/null +++ b/Samples/Partitioned/Unnamed/2017/01/data2.csv @@ -0,0 +1,3 @@ +col1, col2 +4, 5 +6, 7 \ No newline at end of file diff --git a/Samples/Partitioned/Unnamed/2017/01/dataBadSchema.csv b/Samples/Partitioned/Unnamed/2017/01/dataBadSchema.csv new file mode 100644 index 0000000000..43dbd9e3f6 --- /dev/null +++ b/Samples/Partitioned/Unnamed/2017/01/dataBadSchema.csv @@ -0,0 +1,3 @@ +col1 +11 +12 diff --git a/Samples/Partitioned/Unnamed/2017/02/data1.csv b/Samples/Partitioned/Unnamed/2017/02/data1.csv new file mode 100644 index 0000000000..6d80a7f679 --- /dev/null +++ b/Samples/Partitioned/Unnamed/2017/02/data1.csv @@ -0,0 +1,4 @@ +col1, col2 +21, 22 +23, 24 +25, 26 diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt b/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt new file mode 100644 index 0000000000..68888f43f4 --- /dev/null +++ b/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt @@ -0,0 +1,14 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=L0:TX:0 +#@ col=Year:TX:1 +#@ col=Month:TX:2 +#@ } +L0 Year Month +0 2017 01 +4 2017 01 +6 2017 01 +21 2017 02 +23 2017 02 +25 2017 02 diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt b/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt new file mode 100644 index 0000000000..0220433ff0 --- /dev/null +++ b/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt @@ -0,0 +1,5 @@ +---- PartitionedFileLoader ---- +3 columns: + L0: Text + Year: Text + Month: Text diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt b/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt new file mode 100644 index 0000000000..103029123f --- /dev/null +++ b/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt @@ -0,0 +1,16 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=L0:I4:0 +#@ col=Month:I4:1 +#@ col=Path:TX:2 +#@ } +L0 Month Path +1 1 2017\01\data1.csv +5 1 2017\01\data2.csv +7 1 2017\01\data2.csv +0 
1 2017\01\dataBadSchema.csv +0 1 2017\01\dataBadSchema.csv +22 2 2017\02\data1.csv +24 2 2017\02\data1.csv +26 2 2017\02\data1.csv diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt b/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt new file mode 100644 index 0000000000..5eeac7698b --- /dev/null +++ b/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt @@ -0,0 +1,5 @@ +---- PartitionedFileLoader ---- +3 columns: + L0: I4 + Month: I4 + Path: Text diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Data.txt b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Data.txt new file mode 100644 index 0000000000..68888f43f4 --- /dev/null +++ b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Data.txt @@ -0,0 +1,14 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=L0:TX:0 +#@ col=Year:TX:1 +#@ col=Month:TX:2 +#@ } +L0 Year Month +0 2017 01 +4 2017 01 +6 2017 01 +21 2017 02 +23 2017 02 +25 2017 02 diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Schema.txt b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Schema.txt new file mode 100644 index 0000000000..0220433ff0 --- /dev/null +++ b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Schema.txt @@ -0,0 +1,5 @@ +---- PartitionedFileLoader ---- +3 columns: + L0: Text + Year: Text + Month: Text diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt new file mode 100644 index 0000000000..103029123f --- /dev/null +++ b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt @@ -0,0 +1,16 @@ +#@ TextLoader{ +#@ header+ +#@ sep=tab +#@ col=L0:I4:0 +#@ col=Month:I4:1 +#@ col=Path:TX:2 +#@ } +L0 Month Path +1 1 2017\01\data1.csv +5 1 2017\01\data2.csv +7 1 2017\01\data2.csv +0 1 2017\01\dataBadSchema.csv +0 1 2017\01\dataBadSchema.csv +22 2 2017\02\data1.csv +24 2 2017\02\data1.csv +26 2 2017\02\data1.csv diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Schema.txt b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Schema.txt new file mode 100644 index 0000000000..5eeac7698b --- /dev/null +++ b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Schema.txt @@ -0,0 +1,5 @@ +---- PartitionedFileLoader ---- +3 columns: + L0: I4 + Month: I4 + Path: Text diff --git a/src/Microsoft.ML.Core/Utilities/PathUtils.cs b/src/Microsoft.ML.Core/Utilities/PathUtils.cs index e1129bf9c5..6919f5fd29 100644 --- a/src/Microsoft.ML.Core/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Core/Utilities/PathUtils.cs @@ -1,9 +1,11 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System; +using System.Collections.Generic; using System.IO; +using System.Linq; using System.Threading; namespace Microsoft.ML.Runtime.Internal.Utilities @@ -67,13 +69,13 @@ public static string FindExistentFileOrNull(string fileName, string folderPrefix // 1. Search in customSearchDir. 
if (!string.IsNullOrWhiteSpace(customSearchDir) && TryFindFile(fileName, folderPrefix, customSearchDir, out candidate)) - return candidate; + return candidate; // 2. Search in the path specified by the environment variable. var envDir = Environment.GetEnvironmentVariable(CustomSearchDirEnvVariable); if (!string.IsNullOrWhiteSpace(envDir) && TryFindFile(fileName, folderPrefix, envDir, out candidate)) - return candidate; + return candidate; // 3. Search in the path specified by the assemblyForBasePath. if (assemblyForBasePath != null) @@ -139,5 +141,63 @@ public static string CreateFolderIfNotExists(string folder) return null; } + + /// + /// Make a full path relative to a base path. + /// + /// The base path, assumed to be a directory. + /// The full path. + /// The relative path. + /// Thrown if the paths cannot be made relative. + public static string MakePathRelative(string basepath, string path) + { + Contracts.AssertNonEmpty(basepath); + Contracts.AssertNonEmpty(path); + + Uri baseUri = new Uri(basepath); + Uri uri = new Uri(path); + + if (baseUri.Scheme != uri.Scheme) + { + throw new ArgumentException("Paths cannot be made relative as they are of different schemes."); + } + + string relativePath; + try + { + if (!baseUri.AbsoluteUri.EndsWith("/")) + { + baseUri = new Uri(baseUri.AbsoluteUri + "/"); + } + + relativePath = baseUri.MakeRelativeUri(uri).ToString(); + } + catch (ArgumentNullException e) + { + throw new ArgumentException("Paths could not be made relative.", e); + } + catch (InvalidOperationException e) + { + throw new ArgumentException("Paths could not be made relative.", e); + } + + if (uri.Scheme.Equals("file", StringComparison.InvariantCultureIgnoreCase)) + { + relativePath = relativePath.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); + } + + return relativePath; + } + + /// + /// Split a path string into an enumerable list of the directories. + /// + /// The path string to split. + /// An enumerable list of all non-empty directories. + public static IEnumerable SplitDirectories(string path) + { + var cleanPath = path.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); + return cleanPath.Split(Path.DirectorySeparatorChar).Where(dir => !String.IsNullOrEmpty(dir)); + } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs new file mode 100644 index 0000000000..5b82490d14 --- /dev/null +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -0,0 +1,726 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information.
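// Illustrative sketch (not part of the diff above): how the two PathUtils helpers added here are
// expected to combine, assuming they are visible to the calling assembly. The exact separators and
// any URI escaping of characters such as '=' depend on the platform and the Uri class.
using System;
using Microsoft.ML.Runtime.Internal.Utilities;

internal static class PathUtilsSketch
{
    internal static void Demo()
    {
        string basePath = @"C:\Data\Partitioned\Named";
        string fullPath = @"C:\Data\Partitioned\Named\Year=2017\Month=01\data1.csv";

        // Expected to be roughly "Year=2017\Month=01\data1.csv"; an ArgumentException is thrown
        // when the two paths use different URI schemes or cannot be related.
        string relative = Utils.MakePathRelative(basePath, fullPath);

        // Expected segments: "Year=2017", "Month=01", "data1.csv".
        foreach (string segment in Utils.SplitDirectories(relative))
            Console.WriteLine(segment);
    }
}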
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Data.Conversion; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(PartitionedFileLoader.Summary, typeof(PartitionedFileLoader), typeof(PartitionedFileLoader.Arguments), typeof(SignatureDataLoader), + PartitionedFileLoader.UserName, PartitionedFileLoader.LoadName, PartitionedFileLoader.ShortName)] + +[assembly: LoadableClass(PartitionedFileLoader.Summary, typeof(PartitionedFileLoader), null, typeof(SignatureLoadDataLoader), + PartitionedFileLoader.UserName, PartitionedFileLoader.LoadName, PartitionedFileLoader.ShortName)] + +namespace Microsoft.ML.Runtime.Data +{ + /// + /// Loads a set of directory partitioned files into an IDataView. + /// The directories of the file will treated as column data and the underlying files are loaded using the data loader. + /// The first file will be used as the basis for all follow-up file paths and schemas. Any files that don't match + /// the expected path or schema will be skipped. + /// + /// + /// Sample directory structure: + /// + /// Data/ + /// Year=2017/ + /// Month=01/ + /// data1.parquet + /// data1.parquet + /// Month=02/ + /// data1.parquet + /// data1.parquet + /// Year=2018/ + /// Month=01/ + /// data1.parquet + /// data1.parquet + /// + public sealed class PartitionedFileLoader : IDataLoader + { + internal const string Summary = "Loads a horizontally partitioned file set."; + internal const string UserName = "Partitioned Loader"; + public const string LoadName = "PartitionedLoader"; + public const string ShortName = "Part"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "PARTLOAD", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoadName); + } + + public class Arguments + { + [Argument(ArgumentType.Required, HelpText = "Base path to the directory of your partitioned files.", ShortName = "bp")] + public string BasePath; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Append a column with the file path.", ShortName = "path")] + public bool IncludePathColumn = false; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Path parser to extract column name/value pairs from the file path.", ShortName = "parser")] + public IPartitionedPathParserFactory PathParserFactory = new ParquetPartitionedPathParserFactory(); + + [Argument(ArgumentType.Multiple, HelpText = "The data loader.")] + public SubComponent Loader; + } + + public sealed class Column + { + [Argument(ArgumentType.Required, HelpText = "Name of the column.")] + public string Name; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Data type of the column.")] + public DataKind? Type; + + [Argument(ArgumentType.Required, HelpText = "Source index of the column.")] + public int Source; + + public static Column Parse(string str) + { + Contracts.AssertNonEmpty(str); + + if (TryParse(str, out Column column)) + { + return column; + } + + return null; + } + + public static bool TryParse(string str, out Column column) + { + column = null; + + if (string.IsNullOrEmpty(str)) + { + return false; + } + + if (!ColumnParsingUtils.TryParse(str, out string name, out string sourceStr, out string kindStr)) + { + return false; + } + + DataKind? 
kind = null; + if (kindStr != null && TypeParsingUtils.TryParseDataKind(kindStr, out DataKind parsedKind, out KeyRange range)) + { + kind = parsedKind; + } + + if (!int.TryParse(sourceStr, out int source)) + { + return false; + } + + column = new Column() + { + Name = name, + Source = source, + Type = kind + }; + + return true; + } + + public bool TryUnparse(StringBuilder sb) + { + Contracts.AssertValue(sb); + + sb.Append($"{Name}"); + + if (Type.HasValue) + { + sb.Append($":{Type}"); + } + + sb.Append($":{Source}"); + + return true; + } + } + + private readonly IHost _host; + private readonly IMultiStreamSource _files; + private readonly Column[] _columns; + + // Number of tailing directories to include. + private readonly int _tailingDirCount; + + // An underlying loader used on each individual loader. + private readonly SubComponent _subLoader; + + private readonly IPartitionedPathParser _pathParser; + + private const string RegistrationName = LoadName; + + public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamSource files) + { + Contracts.CheckValue(env, nameof(env)); + _host = env.Register(RegistrationName); + _host.CheckValue(args, nameof(args)); + _host.CheckValue(files, nameof(files)); + + _pathParser = args.PathParserFactory.CreateComponent(_host); + _host.CheckValue(_pathParser, nameof(_pathParser), "Factory failed to create a FilePathSpec"); + + _subLoader = args.Loader; + _files = files; + + string relativePath = GetRelativePath(args.BasePath, files); + _columns = ParseColumns(relativePath).ToArray(); + _tailingDirCount = GetDirectoryCount(relativePath); + + if (args.IncludePathColumn) + { + var pathCol = new Column() + { + Name = "Path", + Source = -1, + Type = DataKind.Text + }; + + _columns = _columns.Concat(new[] { pathCol }).ToArray(); + } + + Schema = CreateSchema(_host, _columns, _subLoader); + } + + private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSource files) + { + Contracts.AssertValue(host); + _host = host; + _host.AssertValue(ctx); + _host.AssertValue(files); + + // ** Binary format ** + // int: tailing directory count + // int: number of columns + // foreach column: + // string: column representation + // string: subloader + // model: file path spec + + _tailingDirCount = ctx.Reader.ReadInt32(); + + int numColumns = ctx.Reader.ReadInt32(); + _host.CheckDecode(numColumns >= 0); + + _columns = new Column[numColumns]; + for (int i = 0; i < numColumns; i++) + { + var column = Column.Parse(ctx.LoadString()); + _host.CheckDecode(column != null); + _columns[i] = column; + } + + var loader = SubComponent.Parse(ctx.LoadString()); + _subLoader = new SubComponent(loader.Kind, loader.Settings); + + ctx.LoadModel(_host, out _pathParser, "FilePathSpec"); + + _files = files; + Schema = CreateSchema(_host, _columns, _subLoader); + } + + public static PartitionedFileLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files) + { + Contracts.CheckValue(env, nameof(env)); + IHost host = env.Register(RegistrationName); + + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + env.CheckValue(files, nameof(files)); + + return host.Apply("Loading Model", + ch => new PartitionedFileLoader(host, ctx, files)); + } + + public void Save(ModelSaveContext ctx) + { + Contracts.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // ** Binary format ** + // int: tailing directory count + // int: number of columns + // foreach column: + // string: column 
representation + // string: subloader + // model: file path spec + + ctx.Writer.Write(_tailingDirCount); + + ctx.Writer.Write(_columns.Length); + StringBuilder sb = new StringBuilder(); + foreach (var col in _columns) + { + sb.Clear(); + _host.Check(col.TryUnparse(sb)); + ctx.SaveString(sb.ToString()); + } + + ctx.SaveString(_subLoader.ToString()); + ctx.SaveModel(_pathParser, "FilePathSpec"); + } + + public bool CanShuffle => true; + + public ISchema Schema { get; } + + private ISchema SubSchema { get; set; } + + public long? GetRowCount(bool lazy = true) + { + return null; + } + + public IRowCursor GetRowCursor(Func needCol, IRandom rand = null) + { + return new Cursor(_host, this, _files, needCol, rand); + } + + public IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator, Func needCol, int n, IRandom rand = null) + { + consolidator = null; + var cursor = new Cursor(_host, this, _files, needCol, rand); + return new IRowCursor[] { cursor }; + } + + /// + /// Create a composite schema of both the partitioned columns and the underlying loader columns. + /// + /// The exception context. + /// The partitioned columns. + /// The sub loader. + /// The resulting schema. + private ISchema CreateSchema(IExceptionContext ectx, Column[] cols, SubComponent subComponent) + { + Contracts.AssertValue(cols); + Contracts.AssertValue(subComponent); + + var columnNameTypes = cols.Select((col) => new KeyValuePair(col.Name, PrimitiveType.FromKind(col.Type.Value))); + var colSchema = new SimpleSchema(ectx, columnNameTypes.ToArray()); + + SubSchema = subComponent.CreateInstance(_host, _files).Schema; + + if (SubSchema.ColumnCount == 0) + { + return colSchema; + } + else + { + var schemas = new ISchema[] + { + SubSchema, + colSchema + }; + + return new CompositeSchema(schemas); + } + } + + private sealed class Cursor : RootCursorBase, IRowCursor + { + private PartitionedFileLoader _parent; + + private bool[] _active; + private bool[] _subActive; // Active columns of the sub-cursor. + private Delegate[] _getters; + private Delegate[] _subGetters; // Cached getters of the sub-cursor. + + private DvText[] _colValues; // Column values cached from the file path. + private IRowCursor _subCursor; // Sub cursor of the current file. 
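// Note (illustrative): CreateSchema above places the sub-loader's columns first and the columns
// parsed from the directory path after them. For the "Unnamed" test added later in this patch
// (loader=Text{col=L0:I4:1}, parser=SmplPP{col=Month:I4:1}, path+), the composite schema is
// { L0: I4, Month: I4, Path: TX }, matching the PartitionedUnnamedDirectories baseline files.
// Column specifications use the "Name[:Type]:Source" form handled by Column.Parse and
// Column.TryUnparse, e.g. "Month:I4:1"; a Source of -1 marks the synthesized Path column.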
+ + private IEnumerator _fileOrder; + + public Cursor(IChannelProvider provider, PartitionedFileLoader parent, IMultiStreamSource files, Func predicate, IRandom rand) + : base(provider) + { + Contracts.AssertValue(parent); + Contracts.AssertValue(files); + Contracts.AssertValue(predicate); + + _parent = parent; + + _active = Utils.BuildArray(Schema.ColumnCount, predicate); + _subActive = _active.Take(SubColumnCount).ToArray(); + _colValues = new DvText[_parent._columns.Length]; + + _subGetters = new Delegate[SubColumnCount]; + _getters = CreateGetters(); + + _fileOrder = CreateFileOrder(rand).GetEnumerator(); + } + + public override long Batch => 0; + + public ISchema Schema => _parent.Schema; + + public ValueGetter GetGetter(int col) + { + Ch.Check(IsColumnActive(col)); + + var getter = _getters[col] as ValueGetter; + if (getter == null) + { + throw Ch.Except("Invalid TValue: '{0}'", typeof(TValue)); + } + + return getter; + } + + public override ValueGetter GetIdGetter() + { + return + (ref UInt128 val) => + { + Ch.Check(IsGood, "Cannot call ID getter in current state"); + + val = new UInt128(0, (ulong)Position); + }; + } + + public bool IsColumnActive(int col) + { + Ch.Check(0 <= col && col < Schema.ColumnCount); + return _active[col]; + } + + protected override bool MoveNextCore() + { + // Iterate sub cursor or move to the next file. + while (_subCursor == null || !_subCursor.MoveNext()) + { + // Cleanup old sub cursor + if (_subCursor != null) + { + _subCursor.Dispose(); + _subCursor = null; + } + + if (!TryGetNextPathAndValues(out string path, out string relativePath, out List values)) + { + return false; + } + + // Load the sub cursor and reset the data. + var loader = _parent._subLoader.CreateInstance(_parent._host, new MultiFileSource(path)); + + if (!SchemasMatch(_parent.SubSchema, loader.Schema)) + { + Ch.Warning($"Schema of file {path} does not match."); + continue; + } + + _subCursor = loader.GetRowCursor(col => _subActive[col]); + + try + { + UpdateSubGetters(); + UpdateColumnValues(relativePath, values); + } + catch (FormatException e) + { + // Failed to load this file so skip. + Ch.Warning(MessageSensitivity.Schema, e.Message); + if (_subCursor != null) + { + _subCursor.Dispose(); + _subCursor = null; + } + } + } + + return true; + } + + private bool TryGetNextPathAndValues(out string path, out string relativePath, out List values) + { + path = null; + relativePath = null; + values = null; + + do + { + // No more files to load. + if (!_fileOrder.MoveNext()) + { + return false; + } + + // Get next file and parse the column values from the file path. + string curPath = _parent._files.GetPathOrNull(_fileOrder.Current); + if (String.IsNullOrEmpty(curPath)) + { + Ch.Warning($"File at index {_fileOrder.Current} is missing a path. Loading of file is being skipped."); + continue; + } + + if (!TryTruncatePath(_parent._tailingDirCount, curPath, out relativePath)) + { + continue; + } + + if (!TryParseValuesFromPath(relativePath, out values)) + { + continue; + } + + path = curPath; + + } while (String.IsNullOrEmpty(path)); + + return true; + } + + private void UpdateSubGetters() + { + // Reset getters for the subcursor. + for (int i = 0; i < SubColumnCount; i++) + { + if (_subActive[i]) + { + var type = _parent.SubSchema.GetColumnType(i); + _subGetters[i] = MarshalGetter(_subCursor.GetGetter, type.RawType, i); + } + } + } + + private void UpdateColumnValues(string path, List values) + { + // Cache the column values for future Getter calls. 
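// Each col.Source indexes into the values parsed from the directory names, while the special
// source of -1 (the synthesized Path column) receives the truncated relative path itself. The
// cached DvText values are converted to the declared column types later, inside the getters
// produced by CreateGetters.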
+ for (int i = 0; i < _colValues.Length; i++) + { + var col = _parent._columns[i]; + + var source = col.Source; + if (source >= 0 && source < values.Count) + { + _colValues[i] = new DvText(values[source]); + } + else if (source == -1) + { + _colValues[i] = new DvText(path); + } + } + } + + private Delegate[] CreateGetters() + { + Delegate[] getters = new Delegate[Schema.ColumnCount]; + for (int i = 0; i < getters.Length; i++) + { + if (!_active[i]) + { + continue; + } + + var type = Schema.GetColumnType(i); + + // Use sub-cursor for all sub-columns. + if (IsSubColumn(i)) + { + getters[i] = Utils.MarshalInvoke(CreateSubGetterDelegateCore, type.RawType, i); + } + else + { + int idx = i - SubColumnCount; + getters[i] = Utils.MarshalInvoke(CreateGetterDelegateCore, type.RawType, idx, type); + } + } + + return getters; + } + + private Delegate CreateSubGetterDelegateCore(int col) + { + return (Delegate)SubGetterDelegateCore(col); + } + + private ValueGetter SubGetterDelegateCore(int col) + { + Ch.Check(col >= 0 && col < SubColumnCount); + + return (ref TValue value) => + { + // SubCursor may change so always requery the getter. + ValueGetter getter = _subGetters[col] as ValueGetter; + getter?.Invoke(ref value); + }; + } + + private Delegate CreateGetterDelegateCore(int col, ColumnType type) + { + return (Delegate)GetterDelegateCore(col, type); + } + + private ValueGetter GetterDelegateCore(int col, ColumnType type) + { + Ch.Check(col >= 0 && col < _colValues.Length); + Ch.AssertValue(type); + + var conv = Conversions.Instance.GetStandardConversion(TextType.Instance, type) as ValueMapper; + if (conv == null) + { + throw Ch.Except("Invalid TValue: '{0}' of the conversion.", typeof(TValue)); + } + + return (ref TValue value) => + { + conv(ref _colValues[col], ref value); + }; + } + + private bool IsSubColumn(int col) + { + return col < SubColumnCount; + } + + private int SubColumnCount => Schema.ColumnCount - _parent._columns.Length; + + private IEnumerable CreateFileOrder(IRandom rand) + { + if (rand == null) + { + return Enumerable.Range(0, _parent._files.Count); + } + else + { + return Utils.GetRandomPermutation(rand, _parent._files.Count); + } + } + + private bool SchemasMatch(ISchema schema1, ISchema schema2) + { + if (schema1.ColumnCount != schema2.ColumnCount) + { + return false; + } + + int colLim = schema1.ColumnCount; + for (int col = 0; col < colLim; col++) + { + var type1 = schema1.GetColumnType(col); + var type2 = schema2.GetColumnType(col); + if (!type1.Equals(type2)) + { + return false; + } + } + + return true; + } + + private Delegate MarshalGetter(Func> func, Type type, int col) + { + var returnType = typeof(ValueGetter<>).MakeGenericType(type); + var meth = func.Method; + + var typedMeth = meth.GetGenericMethodDefinition().MakeGenericMethod(type); + return (Delegate)typedMeth.Invoke(func.Target, new object[] { col }); + } + + /// + /// Truncate path to the specified number of trailing directories. + /// + /// Number of directories to retain. + /// Path to truncate. + /// The resulting truncated path. + /// true if the truncation was successful. + private bool TryTruncatePath(int dirCount, string path, out string truncPath) + { + truncPath = null; + + // Remove directories that shouldn't be parsed. 
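// Keep only the trailing dirCount partition directories plus the file name, so that directories
// above the supplied base path never contribute column values.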
+ var segments = Utils.SplitDirectories(path); + segments = segments.Skip(segments.Count() - dirCount - 1); + + if (segments.Count() < dirCount - 1) + { + Ch.Warning($"Path {path} did not have {dirCount} directories necessary for parsing."); + return false; + } + + // Rejoin segments to create a valid path. + truncPath = String.Join(Path.DirectorySeparatorChar.ToString(), segments); + return true; + } + + + /// + /// Parse all column values from the directory path. + /// + /// The directory path to parse for name/value pairs. + /// The resulting name value pairs. + /// true if the parsing was successfull. + private bool TryParseValuesFromPath(string path, out List results) + { + Contracts.CheckNonWhiteSpace(path, nameof(path)); + + results = null; + + try + { + results = _parent._pathParser.ParseValues(path).ToList(); + return true; + } + catch (FormatException e) + { + Ch.Warning($"Could not parse column values from the path {path}.", e); + results = null; + return false; + } + } + } + + /// + /// Get a path relative to the base path. + /// + /// A base path. + /// A list of files under the base path. + /// A realtive file path. + private string GetRelativePath(string basepath, IMultiStreamSource files) + { + Contracts.CheckNonEmpty(basepath, nameof(basepath)); + + string path = files.GetPathOrNull(0); + _host.CheckNonEmpty(path, nameof(path)); + + var relativePath = Utils.MakePathRelative(basepath, path); + return relativePath; + } + + /// + /// Parse the column definitions using a path parser. + /// + /// The path to a file. + /// The resulting Columns. + private IEnumerable ParseColumns(string path) + { + return _pathParser.ParseColumns(path).ToArray(); + } + + /// + /// Get the number of directories in the file path. + /// + /// A file path. + /// The number of directories + private int GetDirectoryCount(string path) + { + return Utils.SplitDirectories(path).Count() - 1; + } + } +} diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs new file mode 100644 index 0000000000..6a2d06baf4 --- /dev/null +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs @@ -0,0 +1,385 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
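// Illustrative usage (not part of the diff): the loader defined in PartitionedFileLoader.cs above
// is driven from command lines such as the ones in the tests added later in this patch, where bp=
// is the base path and loader= configures the per-file loader:
//
//   loader=Part{bp=<basePath> loader=Text{header+ sep=comma col=L0:TX:0}}
//   loader=Part{parser=SmplPP{col=Month:I4:1} path+ bp=<basePath> loader=Text{header+ sep=comma col=L0:I4:1}}
//
// The first form relies on the default Parquet-style parser (Year=2017/Month=01 directories); the
// second uses the simple parser with explicitly declared columns and appends the Path column.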
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Web; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(SimplePartitionedPathParser.Summary, typeof(SimplePartitionedPathParser), typeof(SimplePartitionedPathParser.Arguments), typeof(PartitionedPathParser), + SimplePartitionedPathParser.UserName, SimplePartitionedPathParser.LoadName, SimplePartitionedPathParser.ShortName)] +[assembly: LoadableClass(ParquetPartitionedPathParser.Summary, typeof(ParquetPartitionedPathParser), null, typeof(PartitionedPathParser), + ParquetPartitionedPathParser.UserName, ParquetPartitionedPathParser.LoadName, ParquetPartitionedPathParser.ShortName)] + +// This is for deserialization +[assembly: LoadableClass(SimplePartitionedPathParser.Summary, typeof(SimplePartitionedPathParser), null, typeof(SignatureLoadModel), + SimplePartitionedPathParser.UserName, SimplePartitionedPathParser.LoadName, SimplePartitionedPathParser.ShortName)] +[assembly: LoadableClass(ParquetPartitionedPathParser.Summary, typeof(ParquetPartitionedPathParser), null, typeof(SignatureLoadModel), + ParquetPartitionedPathParser.UserName, ParquetPartitionedPathParser.LoadName, ParquetPartitionedPathParser.ShortName)] + +[assembly: EntryPointModule(typeof(SimplePartitionedPathParser.Arguments))] +[assembly: EntryPointModule(typeof(ParquetPartitionedPathParserFactory))] + +namespace Microsoft.ML.Runtime.Data +{ + /// + /// Delegate signature for a partitioned path parser. + /// + public delegate void PartitionedPathParser(); + + /// + /// Supports extracting column names and values from a path string. + /// + public interface IPartitionedPathParser + { + /// + /// Extract the column definitions from a file path. + /// + /// The file path. + /// The resulting column definitions. + /// Thrown when parsing fails. + IEnumerable ParseColumns(string path); + + /// + /// Extract the column values from a file path. + /// + /// The file path. + /// The resulting column values. + /// Thrown when parsing fails. + IEnumerable ParseValues(string path); + } + + [TlcModule.ComponentKind("PartitionedPathParser")] + public interface IPartitionedPathParserFactory : IComponentFactory + { + new IPartitionedPathParser CreateComponent(IHostEnvironment env); + } + + public sealed class SimplePartitionedPathParser : IPartitionedPathParser, ICanSaveModel + { + internal const string Summary = "A simple parser that extracts directory names as column values. Column names are defined as arguments."; + internal const string UserName = "Simple Partitioned Path Parser"; + public const string LoadName = "SimplePathParser"; + public const string ShortName = "SmplPP"; + + [TlcModule.Component(Name = SimplePartitionedPathParser.LoadName, FriendlyName = SimplePartitionedPathParser.UserName, + Desc = SimplePartitionedPathParser.Summary, Alias = SimplePartitionedPathParser.ShortName)] + public class Arguments : IPartitionedPathParserFactory + { + [Argument(ArgumentType.Multiple, HelpText = "Column definitions used to override the Partitioned Path Parser. Expected with the format name:type:numeric-source, e.g. 
col=MyFeature:R4:1", + ShortName = "col", SortOrder = 1)] + public Microsoft.ML.Runtime.Data.PartitionedFileLoader.Column[] Columns; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Data type of each column.")] + public DataKind? Type = DataKind.Text; + + public IPartitionedPathParser CreateComponent(IHostEnvironment env) => new SimplePartitionedPathParser(env, this); + } + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "SMPLPARS", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoadName); + } + + private IHost _host; + private PartitionedFileLoader.Column[] _columns; + + public SimplePartitionedPathParser(IHostEnvironment env, Arguments args) + { + _host = env.Register(LoadName); + + _columns = args.Columns; + foreach (var col in _columns) + { + if (!col.Type.HasValue) + { + col.Type = args.Type.HasValue ? args.Type : DataKind.Text; + } + } + } + + private SimplePartitionedPathParser(IHost host, ModelLoadContext ctx) + { + Contracts.AssertValue(host); + _host = host; + _host.AssertValue(ctx); + + // ** Binary format ** + // int: number of columns + // foreach column: + // string: column representation + + int numColumns = ctx.Reader.ReadInt32(); + _host.CheckDecode(numColumns >= 0); + + _columns = new PartitionedFileLoader.Column[numColumns]; + for (int i = 0; i < numColumns; i++) + { + var column = PartitionedFileLoader.Column.Parse(ctx.LoadString()); + _host.CheckDecode(column != null); + _columns[i] = column; + } + } + + public static SimplePartitionedPathParser Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(ctx, nameof(ctx)); + IHost host = env.Register(LoadName); + ctx.CheckAtModel(GetVersionInfo()); + + return host.Apply("Loading Parser", + ch => new SimplePartitionedPathParser(host, ctx)); + } + + public void Save(ModelSaveContext ctx) + { + Contracts.CheckValue(ctx, nameof(ctx)); + ctx.SetVersionInfo(GetVersionInfo()); + + // ** Binary format ** + // int: number of columns + // foreach column: + // string: column representation + + ctx.Writer.Write(_columns.Length); + StringBuilder sb = new StringBuilder(); + foreach (var col in _columns) + { + sb.Clear(); + _host.Check(col.TryUnparse(sb)); + ctx.SaveString(sb.ToString()); + } + } + + public IEnumerable ParseColumns(string path) + { + Contracts.AssertNonEmpty(path); + + // Verify that path matches the columns expectations. + var values = ParseValues(path); + foreach (var col in _columns) + { + if (col.Source < 0 || col.Source >= values.Count()) + { + throw new FormatException($"Column definition {col} is outside the bounds of path {path}."); + } + } + + return _columns; + } + + public IEnumerable ParseValues(string path) + { + Contracts.AssertNonEmpty(path); + + var dirs = Utils.SplitDirectories(path); + return dirs.Take(dirs.Count() - 1); // Ignore last directory which is the file name. 
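// Example (from the Unnamed sample set in this patch): for the relative path "2017\01\data1.csv"
// ParseValues returns "2017" and "01". With the declaration col=Month:I4:1 the second value feeds
// the Month column, which the loader's getter converts to the I4 value 1, as reflected in the
// PartitionedUnnamedDirectories baselines.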
+ } + } + + [TlcModule.Component(Name = ParquetPartitionedPathParser.LoadName, FriendlyName = ParquetPartitionedPathParser.UserName, + Desc = ParquetPartitionedPathParser.Summary, Alias = ParquetPartitionedPathParser.ShortName)] + public class ParquetPartitionedPathParserFactory : IPartitionedPathParserFactory + { + public IPartitionedPathParser CreateComponent(IHostEnvironment env) => new ParquetPartitionedPathParser(); + } + + public sealed class ParquetPartitionedPathParser : IPartitionedPathParser, ICanSaveModel + { + internal const string Summary = "Extract name/value pairs from Parquet formatted directory names. Example path: Year=2018/Month=12/data1.parquet"; + internal const string UserName = "Parquet Partitioned Path Parser"; + public const string LoadName = "ParquetPathParser"; + public const string ShortName = "ParqPP"; + + private IHost _host; + private PartitionedFileLoader.Column[] _columns; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "PARQPARS", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoadName); + } + + public ParquetPartitionedPathParser() + { + _columns = new PartitionedFileLoader.Column[0]; + } + + private ParquetPartitionedPathParser(IHost host, ModelLoadContext ctx) + { + Contracts.AssertValue(host); + _host = host; + _host.AssertValue(ctx); + + // ** Binary format ** + // int: number of columns + // foreach column: + // string: column representation + + int numColumns = ctx.Reader.ReadInt32(); + _host.CheckDecode(numColumns >= 0); + + _columns = new PartitionedFileLoader.Column[numColumns]; + for (int i = 0; i < numColumns; i++) + { + var column = PartitionedFileLoader.Column.Parse(ctx.LoadString()); + _host.CheckDecode(column != null); + _columns[i] = column; + } + } + + public static ParquetPartitionedPathParser Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(ctx, nameof(ctx)); + IHost host = env.Register(LoadName); + ctx.CheckAtModel(GetVersionInfo()); + + return host.Apply("Loading Parser", + ch => new ParquetPartitionedPathParser(host, ctx)); + } + + public void Save(ModelSaveContext ctx) + { + Contracts.CheckValue(ctx, nameof(ctx)); + ctx.SetVersionInfo(GetVersionInfo()); + + // ** Binary format ** + // int: number of columns + // foreach column: + // string: column representation + + ctx.Writer.Write(_columns.Length); + StringBuilder sb = new StringBuilder(); + foreach (var col in _columns) + { + sb.Clear(); + _host.Check(col.TryUnparse(sb)); + ctx.SaveString(sb.ToString()); + }; + } + + public IEnumerable ParseColumns(string path) + { + if (!TryParseNames(path, out List names)) + { + throw new FormatException($"Failed to parse names from path {path}. Expected directory names with the format 'Name=Value'."); + } + + _columns = new PartitionedFileLoader.Column[names.Count]; + for (int i = 0; i < names.Count; i++) + { + _columns[i] = new PartitionedFileLoader.Column() + { + Name = names[i], + Source = i, + Type = DataKind.Text + }; + } + + return _columns; + } + + public IEnumerable ParseValues(string path) + { + if (!TryParseValues(path, out List values)) + { + throw new FormatException($"Failed to parse names from path {path}. 
Expected directory names with the format 'Name=Value'."); + } + + if (values.Count != _columns.Length) + { + throw new FormatException($"The extracted value count of {values.Count} does not match the expected Column count of {_columns.Length} for path {path}"); + } + + return values; + } + + public bool TryParseNames(string path, out List names) + { + return TryParseNamesAndValues(path, out names, out List values); + } + + public bool TryParseValues(string path, out List values) + { + return TryParseNamesAndValues(path, out List names, out values); + } + + public bool TryParseNamesAndValues(string path, out List names, out List values) + { + names = null; + values = null; + + if (string.IsNullOrEmpty(path)) + { + return false; + } + + var dirs = Utils.SplitDirectories(path); + dirs = dirs.Take(dirs.Count() - 1); // Ignore last directory which is the file name. + + names = new List(dirs.Count()); + values = new List(dirs.Count()); + + foreach (var dir in dirs) + { + if (!TryParseNameValueFromDir(dir, out string name, out string value)) + { + return false; + } + + names.Add(name); + values.Add(value); + } + + return true; + } + + /// + /// Parse the name/value pair from a partitioned directory name. + /// + /// The directory name. + /// The resulting name. + /// The resulting value. + /// true if the parsing was successfull. + private static bool TryParseNameValueFromDir(string dir, out string name, out string value) + { + const char nameValueSeparator = '='; + + name = null; + value = null; + + if (string.IsNullOrEmpty(dir)) + { + return false; + } + + var nameValue = dir.Split(nameValueSeparator); + if (nameValue.Length != 2) + { + return false; + } + + name = nameValue[0]; + value = HttpUtility.UrlDecode(nameValue[1]); + + return true; + } + } +} diff --git a/src/Microsoft.ML.Data/DataView/CompositeSchema.cs b/src/Microsoft.ML.Data/DataView/CompositeSchema.cs new file mode 100644 index 0000000000..4d387de1d5 --- /dev/null +++ b/src/Microsoft.ML.Data/DataView/CompositeSchema.cs @@ -0,0 +1,119 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.ML.Runtime.Internal.Utilities; + +namespace Microsoft.ML.Runtime.Data +{ + /// + /// A convenience class for concatenating several schemas together. + /// This would be necessary when combining IDataViews through any type of combining operation, e.g. zip. + /// + internal sealed class CompositeSchema : ISchema + { + private readonly ISchema[] _sources; + + // Zero followed by cumulative column counts. Zero being used for the empty case. + private readonly int[] _cumulativeColCounts; + + public CompositeSchema(ISchema[] sources) + { + Contracts.AssertNonEmpty(sources); + _sources = sources; + _cumulativeColCounts = new int[_sources.Length + 1]; + _cumulativeColCounts[0] = 0; + + for (int i = 0; i < sources.Length; i++) + { + var schema = sources[i]; + _cumulativeColCounts[i + 1] = _cumulativeColCounts[i] + schema.ColumnCount; + } + } + + public int ColumnCount => _cumulativeColCounts[_cumulativeColCounts.Length - 1]; + + /// + /// Returns an array of input predicated for sources, corresponding to the input predicate. + /// The returned array size is equal to the number of sources, but if a given source is not needed at all, + /// the corresponding predicate will be null. 
+ /// + public Func[] GetInputPredicates(Func predicate) + { + Contracts.AssertValue(predicate); + var result = new Func[_sources.Length]; + for (int i = 0; i < _sources.Length; i++) + { + var lastColCount = _cumulativeColCounts[i]; + result[i] = srcCol => predicate(srcCol + lastColCount); + } + + return result; + } + + /// + /// Checks whether the column index is in range. + /// + public void CheckColumnInRange(int col) + { + Contracts.CheckParam(0 <= col && col < _cumulativeColCounts[_cumulativeColCounts.Length - 1], nameof(col), "Column index out of range"); + } + + public void GetColumnSource(int col, out int srcIndex, out int srcCol) + { + CheckColumnInRange(col); + if (!_cumulativeColCounts.TryFindIndexSorted(0, _cumulativeColCounts.Length, col, out srcIndex)) + srcIndex--; + Contracts.Assert(0 <= srcIndex && srcIndex < _cumulativeColCounts.Length); + srcCol = col - _cumulativeColCounts[srcIndex]; + Contracts.Assert(0 <= srcCol && srcCol < _sources[srcIndex].ColumnCount); + } + + public bool TryGetColumnIndex(string name, out int col) + { + for (int i = _sources.Length; --i >= 0;) + { + if (_sources[i].TryGetColumnIndex(name, out col)) + { + col += _cumulativeColCounts[i]; + return true; + } + } + + col = -1; + return false; + } + + public string GetColumnName(int col) + { + GetColumnSource(col, out int dv, out int srcCol); + return _sources[dv].GetColumnName(srcCol); + } + + public ColumnType GetColumnType(int col) + { + GetColumnSource(col, out int dv, out int srcCol); + return _sources[dv].GetColumnType(srcCol); + } + + public IEnumerable> GetMetadataTypes(int col) + { + GetColumnSource(col, out int dv, out int srcCol); + return _sources[dv].GetMetadataTypes(srcCol); + } + + public ColumnType GetMetadataTypeOrNull(string kind, int col) + { + GetColumnSource(col, out int dv, out int srcCol); + return _sources[dv].GetMetadataTypeOrNull(kind, srcCol); + } + + public void GetMetadata(string kind, int col, ref TValue value) + { + GetColumnSource(col, out int dv, out int srcCol); + _sources[dv].GetMetadata(kind, srcCol, ref value); + } + } +} diff --git a/src/Microsoft.ML.Data/DataView/ZipDataView.cs b/src/Microsoft.ML.Data/DataView/ZipDataView.cs index d10e4f3223..9a7e79bab8 100644 --- a/src/Microsoft.ML.Data/DataView/ZipDataView.cs +++ b/src/Microsoft.ML.Data/DataView/ZipDataView.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. @@ -25,7 +25,7 @@ public sealed class ZipDataView : IDataView private readonly IHost _host; private readonly IDataView[] _sources; - private readonly ZipSchema _schema; + private readonly CompositeSchema _schema; public static IDataView Create(IHostEnvironment env, IEnumerable sources) { @@ -47,7 +47,7 @@ private ZipDataView(IHost host, IDataView[] sources) _host.Assert(Utils.Size(sources) > 1); _sources = sources; - _schema = new ZipSchema(_sources.Select(x => x.Schema).ToArray()); + _schema = new CompositeSchema(_sources.Select(x => x.Schema).ToArray()); } public bool CanShuffle { get { return false; } } @@ -104,127 +104,10 @@ public IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator, Fun return new IRowCursor[] { GetRowCursor(predicate, rand) }; } - /// - /// This is a result of appending several schema together. 
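// Worked example (illustrative) for the CompositeSchema added above: with two sources of 2 and 3
// columns, _cumulativeColCounts is { 0, 2, 5 }. Composite column 3 resolves to srcIndex = 1 and
// srcCol = 3 - 2 = 1 (the second column of the second source); TryGetColumnIndex applies the same
// offsets in the opposite direction.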
- /// - internal sealed class ZipSchema : ISchema - { - private readonly ISchema[] _sources; - // Zero followed by cumulative column counts. - private readonly int[] _cumulativeColCounts; - - public ZipSchema(ISchema[] sources) - { - Contracts.AssertNonEmpty(sources); - _sources = sources; - _cumulativeColCounts = new int[_sources.Length + 1]; - _cumulativeColCounts[0] = 0; - - for (int i = 0; i < sources.Length; i++) - { - var schema = sources[i]; - _cumulativeColCounts[i + 1] = _cumulativeColCounts[i] + schema.ColumnCount; - } - } - - /// - /// Returns an array of input predicated for sources, corresponding to the input predicate. - /// The returned array size is equal to the number of sources, but if a given source is not needed at all, - /// the corresponding predicate will be null. - /// - public Func[] GetInputPredicates(Func predicate) - { - Contracts.AssertValue(predicate); - var result = new Func[_sources.Length]; - for (int i = 0; i < _sources.Length; i++) - { - var lastColCount = _cumulativeColCounts[i]; - result[i] = srcCol => predicate(srcCol + lastColCount); - } - - return result; - } - - /// - /// Checks whether the column index is in range. - /// - public void CheckColumnInRange(int col) - { - Contracts.CheckParam(0 <= col && col < _cumulativeColCounts[_cumulativeColCounts.Length - 1], nameof(col), "Column index out of range"); - } - - public void GetColumnSource(int col, out int srcIndex, out int srcCol) - { - CheckColumnInRange(col); - if (!_cumulativeColCounts.TryFindIndexSorted(0, _cumulativeColCounts.Length, col, out srcIndex)) - srcIndex--; - Contracts.Assert(0 <= srcIndex && srcIndex < _cumulativeColCounts.Length); - srcCol = col - _cumulativeColCounts[srcIndex]; - Contracts.Assert(0 <= srcCol && srcCol < _sources[srcIndex].ColumnCount); - } - - public int ColumnCount { get { return _cumulativeColCounts[_cumulativeColCounts.Length - 1]; } } - - public bool TryGetColumnIndex(string name, out int col) - { - for (int i = _sources.Length; --i >= 0; ) - { - if (_sources[i].TryGetColumnIndex(name, out col)) - { - col += _cumulativeColCounts[i]; - return true; - } - } - - col = -1; - return false; - } - - public string GetColumnName(int col) - { - int dv; - int srcCol; - GetColumnSource(col, out dv, out srcCol); - return _sources[dv].GetColumnName(srcCol); - } - - public ColumnType GetColumnType(int col) - { - int dv; - int srcCol; - GetColumnSource(col, out dv, out srcCol); - return _sources[dv].GetColumnType(srcCol); - } - - public IEnumerable> GetMetadataTypes(int col) - { - int dv; - int srcCol; - GetColumnSource(col, out dv, out srcCol); - return _sources[dv].GetMetadataTypes(srcCol); - } - - public ColumnType GetMetadataTypeOrNull(string kind, int col) - { - int dv; - int srcCol; - GetColumnSource(col, out dv, out srcCol); - return _sources[dv].GetMetadataTypeOrNull(kind, srcCol); - } - - public void GetMetadata(string kind, int col, ref TValue value) - { - int dv; - int srcCol; - GetColumnSource(col, out dv, out srcCol); - _sources[dv].GetMetadata(kind, srcCol, ref value); - } - } - private sealed class Cursor : RootCursorBase, IRowCursor { private readonly IRowCursor[] _cursors; - private readonly ZipSchema _schema; + private readonly CompositeSchema _schema; private readonly bool[] _isColumnActive; public override long Batch { get { return 0; } } diff --git a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs new file mode 100644 index 0000000000..c757ad3ae7 --- /dev/null +++ 
b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs @@ -0,0 +1,54 @@ +using Microsoft.ML.Runtime.RunTests; +using Microsoft.ML.TestFramework; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Tests +{ + public class PartitionedFileLoaderTests : TestDataPipeBase + { + public PartitionedFileLoaderTests(ITestOutputHelper output) + : base(output) + { + + } + + [Fact] + public void PartitionedNamedDirectories() + { + string basePath = Path.Combine(SamplesDir, @"Partitioned\Named"); + string pathData = Path.Combine(basePath, @"...\*.csv"); + + TestCore(pathData, false, + new[] { + "loader=Part{bp=" + basePath + " loader=Text{header+ sep=comma col=L0:TX:0}}" + }); + + Done(); + } + + [Fact] + public void PartitionedUnnamedDirectories() + { + string basePath = Path.Combine(SamplesDir, @"Partitioned\Unnamed"); + string pathData = Path.Combine(basePath, @"...\*.csv"); + + TestCore(pathData, false, + new[] { + "loader=Part{parser=SmplPP{col=Month:I4:1} path+ bp=" + basePath + " loader=Text{header+ sep=comma col=L0:I4:1}}" + }); + + // Test again with global parser data type. + TestCore(pathData, false, + new[] { + "loader=Part{parser=SmplPP{type=I4 col=Month:1} path+ bp=" + basePath + " loader=Text{header+ sep=comma col=L0:I4:1}}" + }); + + Done(); + } + } +} From 67a358cccdc2e53c463074101d2e19a4622de2bc Mon Sep 17 00:00:00 2001 From: tyclintw Date: Mon, 7 May 2018 14:32:08 -0700 Subject: [PATCH 02/25] Roll back to the original DataType. --- src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs index 6a2d06baf4..6392fd5090 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs @@ -79,7 +79,7 @@ public class Arguments : IPartitionedPathParserFactory public Microsoft.ML.Runtime.Data.PartitionedFileLoader.Column[] Columns; [Argument(ArgumentType.AtMostOnce, HelpText = "Data type of each column.")] - public DataKind? Type = DataKind.Text; + public DataKind Type = DataKind.Text; public IPartitionedPathParser CreateComponent(IHostEnvironment env) => new SimplePartitionedPathParser(env, this); } @@ -106,7 +106,7 @@ public SimplePartitionedPathParser(IHostEnvironment env, Arguments args) { if (!col.Type.HasValue) { - col.Type = args.Type.HasValue ? args.Type : DataKind.Text; + col.Type = args.Type; } } } From 0bc8a2bc4937ccfeda4df026908c634dd729019a Mon Sep 17 00:00:00 2001 From: tyclintw Date: Fri, 11 May 2018 10:13:35 -0700 Subject: [PATCH 03/25] Address comments. 
--- src/Microsoft.ML.Core/Utilities/PathUtils.cs | 2 +- .../DataLoadSave/PartitionedFileLoader.cs | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/PathUtils.cs b/src/Microsoft.ML.Core/Utilities/PathUtils.cs index 6919f5fd29..5ed719ba47 100644 --- a/src/Microsoft.ML.Core/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Core/Utilities/PathUtils.cs @@ -197,7 +197,7 @@ public static string MakePathRelative(string basepath, string path) public static IEnumerable SplitDirectories(string path) { var cleanPath = path.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); - return cleanPath.Split(Path.DirectorySeparatorChar).Where(dir => !String.IsNullOrEmpty(dir)); + return cleanPath.Split(new char[] { Path.DirectorySeparatorChar }, StringSplitOptions.RemoveEmptyEntries); } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index 5b82490d14..701967ec64 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -164,6 +164,8 @@ public bool TryUnparse(StringBuilder sb) private readonly IPartitionedPathParser _pathParser; private const string RegistrationName = LoadName; + private const string FilePathSpecName = "FilePathSpec"; + private const int FilePathColIndex = -1; public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamSource files) { @@ -187,7 +189,7 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS var pathCol = new Column() { Name = "Path", - Source = -1, + Source = FilePathColIndex, Type = DataKind.Text }; @@ -228,7 +230,7 @@ private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSour var loader = SubComponent.Parse(ctx.LoadString()); _subLoader = new SubComponent(loader.Kind, loader.Settings); - ctx.LoadModel(_host, out _pathParser, "FilePathSpec"); + ctx.LoadModel(_host, out _pathParser, FilePathSpecName); _files = files; Schema = CreateSchema(_host, _columns, _subLoader); @@ -273,7 +275,7 @@ public void Save(ModelSaveContext ctx) } ctx.SaveString(_subLoader.ToString()); - ctx.SaveModel(_pathParser, "FilePathSpec"); + ctx.SaveModel(_pathParser, FilePathSpecName); } public bool CanShuffle => true; @@ -511,7 +513,7 @@ private void UpdateColumnValues(string path, List values) { _colValues[i] = new DvText(values[source]); } - else if (source == -1) + else if (source == FilePathColIndex) { _colValues[i] = new DvText(path); } @@ -679,7 +681,7 @@ private bool TryParseValuesFromPath(string path, out List results) } catch (FormatException e) { - Ch.Warning($"Could not parse column values from the path {path}.", e); + Ch.Warning($"Could not parse column values from the path {path}. Ex: {e.Message}"); results = null; return false; } From ce3edceac73b31209d65c0e39478d66b0b924d2f Mon Sep 17 00:00:00 2001 From: tyclintw Date: Fri, 11 May 2018 10:31:11 -0700 Subject: [PATCH 04/25] Add exception handling for failed loader. 
--- .../DataLoadSave/PartitionedFileLoader.cs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index 701967ec64..ceaeca3fa6 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -418,8 +418,17 @@ protected override bool MoveNextCore() return false; } - // Load the sub cursor and reset the data. - var loader = _parent._subLoader.CreateInstance(_parent._host, new MultiFileSource(path)); + IDataLoader loader = null; + try + { + // Load the sub cursor and reset the data. + loader = _parent._subLoader.CreateInstance(_parent._host, new MultiFileSource(path)); + } + catch (Exception e) + { + Ch.Warning($"Failed to load file {path} due to a loader exception. Moving on to the next file. Ex: {e.Message}"); + continue; + } if (!SchemasMatch(_parent.SubSchema, loader.Schema)) { From bcd4aadcd7b2836423daa9d7e7e84fb0e41ba20f Mon Sep 17 00:00:00 2001 From: tyclintw Date: Fri, 11 May 2018 10:58:06 -0700 Subject: [PATCH 05/25] Fix Generator issues. This change is a hack and will be addressed in a different PR. --- src/Microsoft.ML/CSharpApi.cs | 485 ++++++++++-------- .../Internal/Tools/CSharpApiGenerator.cs | 4 +- 2 files changed, 270 insertions(+), 219 deletions(-) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index ecea73a495..2fa4afe0e7 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -1547,12 +1547,12 @@ public sealed partial class BinaryCrossValidator /// /// The training subgraph inputs /// - public Models.CrossValidationBinaryMacroSubGraphInput Inputs { get; set; } = new Models.CrossValidationBinaryMacroSubGraphInput(); + public Microsoft.ML.Models.CrossValidationBinaryMacroSubGraphInput Inputs { get; set; } = new Microsoft.ML.Models.CrossValidationBinaryMacroSubGraphInput(); /// /// The training subgraph outputs /// - public Models.CrossValidationBinaryMacroSubGraphOutput Outputs { get; set; } = new Models.CrossValidationBinaryMacroSubGraphOutput(); + public Microsoft.ML.Models.CrossValidationBinaryMacroSubGraphOutput Outputs { get; set; } = new Microsoft.ML.Models.CrossValidationBinaryMacroSubGraphOutput(); /// /// Column to use for stratification @@ -1817,12 +1817,12 @@ public sealed partial class CrossValidator /// /// The training subgraph inputs /// - public Models.CrossValidationMacroSubGraphInput Inputs { get; set; } = new Models.CrossValidationMacroSubGraphInput(); + public Microsoft.ML.Models.CrossValidationMacroSubGraphInput Inputs { get; set; } = new Microsoft.ML.Models.CrossValidationMacroSubGraphInput(); /// /// The training subgraph outputs /// - public Models.CrossValidationMacroSubGraphOutput Outputs { get; set; } = new Models.CrossValidationMacroSubGraphOutput(); + public Microsoft.ML.Models.CrossValidationMacroSubGraphOutput Outputs { get; set; } = new Microsoft.ML.Models.CrossValidationMacroSubGraphOutput(); /// /// Column to use for stratification @@ -1837,7 +1837,7 @@ public sealed partial class CrossValidator /// /// Specifies the trainer kind, which determines the evaluator to be used.
/// - public Models.MacroUtilsTrainerKinds Kind { get; set; } = Models.MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer; + public Microsoft.ML.Models.MacroUtilsTrainerKinds Kind { get; set; } = Microsoft.ML.Models.MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer; public sealed class Output @@ -2211,7 +2211,7 @@ public sealed partial class OneVersusAll : Microsoft.ML.Runtime.EntryPoints.Comm /// /// The training subgraph output. /// - public Models.OneVersusAllMacroSubGraphOutput OutputForSubGraph { get; set; } = new Models.OneVersusAllMacroSubGraphOutput(); + public Microsoft.ML.Models.OneVersusAllMacroSubGraphOutput OutputForSubGraph { get; set; } = new Microsoft.ML.Models.OneVersusAllMacroSubGraphOutput(); /// /// Use probabilities in OVA combiner @@ -2241,12 +2241,12 @@ public sealed partial class OneVersusAll : Microsoft.ML.Runtime.EntryPoints.Comm /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output @@ -2324,12 +2324,12 @@ public sealed partial class OvaModelCombiner : Microsoft.ML.Runtime.EntryPoints. /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output @@ -2778,12 +2778,12 @@ public sealed partial class TrainTestBinaryEvaluator /// /// The training subgraph inputs /// - public Models.TrainTestBinaryMacroSubGraphInput Inputs { get; set; } = new Models.TrainTestBinaryMacroSubGraphInput(); + public Microsoft.ML.Models.TrainTestBinaryMacroSubGraphInput Inputs { get; set; } = new Microsoft.ML.Models.TrainTestBinaryMacroSubGraphInput(); /// /// The training subgraph outputs /// - public Models.TrainTestBinaryMacroSubGraphOutput Outputs { get; set; } = new Models.TrainTestBinaryMacroSubGraphOutput(); + public Microsoft.ML.Models.TrainTestBinaryMacroSubGraphOutput Outputs { get; set; } = new Microsoft.ML.Models.TrainTestBinaryMacroSubGraphOutput(); public sealed class Output @@ -2868,17 +2868,17 @@ public sealed partial class TrainTestEvaluator /// /// The training subgraph inputs /// - public Models.TrainTestMacroSubGraphInput Inputs { get; set; } = new Models.TrainTestMacroSubGraphInput(); + public Microsoft.ML.Models.TrainTestMacroSubGraphInput Inputs { get; set; } = new Microsoft.ML.Models.TrainTestMacroSubGraphInput(); /// /// The training subgraph outputs /// - public Models.TrainTestMacroSubGraphOutput Outputs { get; set; } = new Models.TrainTestMacroSubGraphOutput(); + public Microsoft.ML.Models.TrainTestMacroSubGraphOutput Outputs { get; set; } = new Microsoft.ML.Models.TrainTestMacroSubGraphOutput(); /// /// Specifies the trainer kind, which determines the evaluator to be used. 
/// - public Models.MacroUtilsTrainerKinds Kind { get; set; } = Models.MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer; + public Microsoft.ML.Models.MacroUtilsTrainerKinds Kind { get; set; } = Microsoft.ML.Models.MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer; /// /// Identifies which pipeline was run for this train test. @@ -3063,12 +3063,12 @@ public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Ru /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -3208,12 +3208,12 @@ public sealed partial class BinaryLogisticRegressor : Microsoft.ML.Runtime.Entry /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -3360,7 +3360,7 @@ public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.En /// /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. /// - public Trainers.Bundle Bundling { get; set; } = Trainers.Bundle.None; + public Microsoft.ML.Trainers.Bundle Bundling { get; set; } = Microsoft.ML.Trainers.Bundle.None; /// /// Maximum number of distinct values (bins) per feature @@ -3503,12 +3503,12 @@ public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.En /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -3637,7 +3637,7 @@ public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoin /// /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. 
/// - public Trainers.Bundle Bundling { get; set; } = Trainers.Bundle.None; + public Microsoft.ML.Trainers.Bundle Bundling { get; set; } = Microsoft.ML.Trainers.Bundle.None; /// /// Maximum number of distinct values (bins) per feature @@ -3780,12 +3780,12 @@ public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoin /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -3865,7 +3865,7 @@ public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.Entr /// /// Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) /// - public Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; + public Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; /// /// Early stopping rule. (Validation set (/valid) is required.) @@ -4030,7 +4030,7 @@ public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.Entr /// /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. /// - public Trainers.Bundle Bundling { get; set; } = Trainers.Bundle.None; + public Microsoft.ML.Trainers.Bundle Bundling { get; set; } = Microsoft.ML.Trainers.Bundle.None; /// /// Maximum number of distinct values (bins) per feature @@ -4173,12 +4173,12 @@ public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.Entr /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -4286,7 +4286,7 @@ public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.Co /// /// Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) /// - public Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; + public Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; /// /// Early stopping rule. 
(Validation set (/valid) is required.) @@ -4451,7 +4451,7 @@ public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.Co /// /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. /// - public Trainers.Bundle Bundling { get; set; } = Trainers.Bundle.None; + public Microsoft.ML.Trainers.Bundle Bundling { get; set; } = Microsoft.ML.Trainers.Bundle.None; /// /// Maximum number of distinct values (bins) per feature @@ -4594,12 +4594,12 @@ public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.Co /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRankingOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -4667,7 +4667,7 @@ public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints /// /// Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) /// - public Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; + public Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; /// /// Early stopping rule. (Validation set (/valid) is required.) @@ -4832,7 +4832,7 @@ public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints /// /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. 
/// - public Trainers.Bundle Bundling { get; set; } = Trainers.Bundle.None; + public Microsoft.ML.Trainers.Bundle Bundling { get; set; } = Microsoft.ML.Trainers.Bundle.None; /// /// Maximum number of distinct values (bins) per feature @@ -4975,12 +4975,12 @@ public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -5053,7 +5053,7 @@ public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.Entr /// /// Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) /// - public Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; + public Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = Microsoft.ML.Trainers.BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; /// /// Early stopping rule. (Validation set (/valid) is required.) @@ -5218,7 +5218,7 @@ public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.Entr /// /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. 
/// - public Trainers.Bundle Bundling { get; set; } = Trainers.Bundle.None; + public Microsoft.ML.Trainers.Bundle Bundling { get; set; } = Microsoft.ML.Trainers.Bundle.None; /// /// Maximum number of distinct values (bins) per feature @@ -5361,12 +5361,12 @@ public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.Entr /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -5513,12 +5513,12 @@ public sealed partial class GeneralizedAdditiveModelBinaryClassifier : Microsoft /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -5649,12 +5649,12 @@ public sealed partial class GeneralizedAdditiveModelRegressor : Microsoft.ML.Run /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -5779,12 +5779,12 @@ public sealed partial class LinearSvmBinaryClassifier : Microsoft.ML.Runtime.Ent /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -5924,12 +5924,12 @@ public sealed partial class LogisticRegressor : Microsoft.ML.Runtime.EntryPoints /// /// Normalize option for the feature column /// - public 
Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -5992,12 +5992,12 @@ public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoi /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6142,12 +6142,12 @@ public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtim /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6226,12 +6226,12 @@ public sealed partial class OrdinaryLeastSquaresRegressor : Microsoft.ML.Runtime /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6366,12 +6366,12 @@ public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints. 
/// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6502,12 +6502,12 @@ public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Mic /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6622,12 +6622,12 @@ public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6742,12 +6742,12 @@ public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft. 
/// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -6876,12 +6876,12 @@ public sealed partial class StochasticGradientDescentBinaryClassifier : Microsof /// /// Normalize option for the feature column /// - public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// /// Whether learner should cache input training data /// - public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput @@ -7117,15 +7117,15 @@ public BinNormalizer(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -7133,7 +7133,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NormalizeTransformBinColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NormalizeTransformBinColumn[] Column { get; set; } /// /// Max number of bins, power of 2 recommended @@ -7231,7 +7231,7 @@ public sealed class CategoricalHashTransformColumn : OneToOneColumn /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) /// - public Transforms.CategoricalTransformOutputKind? OutputKind { get; set; } + public Microsoft.ML.Transforms.CategoricalTransformOutputKind? OutputKind { get; set; } /// /// Name of the new column @@ -7279,15 +7279,15 @@ public CategoricalHashOneHotVectorizer(params ValueTuple[] input public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -7295,7 +7295,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:hashBits:src) /// - public Transforms.CategoricalHashTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.CategoricalHashTransformColumn[] Column { get; set; } /// /// Number of bits to hash into. Must be between 1 and 30, inclusive. @@ -7320,7 +7320,7 @@ public void AddColumn(string name, string source) /// /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) /// - public Transforms.CategoricalTransformOutputKind OutputKind { get; set; } = Transforms.CategoricalTransformOutputKind.Bag; + public Microsoft.ML.Transforms.CategoricalTransformOutputKind OutputKind { get; set; } = Microsoft.ML.Transforms.CategoricalTransformOutputKind.Bag; /// /// Input dataset @@ -7381,7 +7381,7 @@ public sealed class CategoricalTransformColumn : OneToOneColumn /// Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector /// - public Transforms.CategoricalTransformOutputKind? OutputKind { get; set; } + public Microsoft.ML.Transforms.CategoricalTransformOutputKind? OutputKind { get; set; } /// /// Maximum number of terms to keep when auto-training @@ -7396,7 +7396,7 @@ public sealed class CategoricalTransformColumn : OneToOneColumn /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). /// - public Transforms.TermTransformSortOrder? Sort { get; set; } + public Microsoft.ML.Transforms.TermTransformSortOrder? Sort { get; set; } /// /// Whether key value metadata should be text, regardless of the actual input type @@ -7449,15 +7449,15 @@ public CategoricalOneHotVectorizer(params ValueTuple[] inputOutp public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -7465,12 +7465,12 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.CategoricalTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.CategoricalTransformColumn[] Column { get; set; } /// /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) /// - public Transforms.CategoricalTransformOutputKind OutputKind { get; set; } = Transforms.CategoricalTransformOutputKind.Ind; + public Microsoft.ML.Transforms.CategoricalTransformOutputKind OutputKind { get; set; } = Microsoft.ML.Transforms.CategoricalTransformOutputKind.Ind; /// /// Maximum number of terms to keep per column when auto-training @@ -7485,7 +7485,7 @@ public void AddColumn(string name, string source) /// /// How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). /// - public Transforms.TermTransformSortOrder Sort { get; set; } = Transforms.TermTransformSortOrder.Occurrence; + public Microsoft.ML.Transforms.TermTransformSortOrder Sort { get; set; } = Microsoft.ML.Transforms.TermTransformSortOrder.Occurrence; /// /// Whether key value metadata should be text, regardless of the actual input type @@ -7588,15 +7588,15 @@ public CharacterTokenizer(params ValueTuple[] inputOutputColumns public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -7604,7 +7604,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.CharTokenizeTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.CharTokenizeTransformColumn[] Column { get; set; } /// /// Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03) @@ -7690,8 +7690,8 @@ public ColumnConcatenator(string outputColumn, params string[] inputColumns) public void AddColumn(string name, params string[] source) { - var list = Column == null ? new List() : new List(Column); - list.Add(ManyToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(ManyToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -7699,7 +7699,7 @@ public void AddColumn(string name, params string[] source) /// /// New column definition(s) (optional form: name:srcs) /// - public Transforms.ConcatTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.ConcatTransformColumn[] Column { get; set; } /// /// Input dataset @@ -7797,15 +7797,15 @@ public ColumnCopier(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -7813,7 +7813,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.CopyColumnsTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.CopyColumnsTransformColumn[] Column { get; set; } /// /// Input dataset @@ -8016,7 +8016,7 @@ public sealed class ConvertTransformColumn : OneToOneColumn /// The result type /// - public Transforms.DataKind? ResultType { get; set; } + public Microsoft.ML.Transforms.DataKind? 
ResultType { get; set; } /// /// For a key column, this defines the range of values @@ -8069,15 +8069,15 @@ public ColumnTypeConverter(params ValueTuple[] inputOutputColumn public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -8085,12 +8085,12 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:type:src) /// - public Transforms.ConvertTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.ConvertTransformColumn[] Column { get; set; } /// /// The result type /// - public Transforms.DataKind? ResultType { get; set; } + public Microsoft.ML.Transforms.DataKind? ResultType { get; set; } /// /// For a key column, this defines the range of values @@ -8268,15 +8268,15 @@ public ConditionalNormalizer(params ValueTuple[] inputOutputColu public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -8284,7 +8284,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NormalizeTransformAffineColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NormalizeTransformAffineColumn[] Column { get; set; } /// /// Whether to map zero to zero, preserving sparsity @@ -8360,7 +8360,7 @@ public sealed partial class DataCache : Microsoft.ML.Runtime.EntryPoints.CommonI /// /// Caching strategy /// - public Transforms.CacheCachingType Caching { get; set; } = Transforms.CacheCachingType.Memory; + public Microsoft.ML.Transforms.CacheCachingType Caching { get; set; } = Microsoft.ML.Transforms.CacheCachingType.Memory; /// /// Input dataset @@ -8498,7 +8498,7 @@ public sealed class TermTransformColumn : OneToOneColumn, I /// /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). /// - public Transforms.TermTransformSortOrder? Sort { get; set; } + public Microsoft.ML.Transforms.TermTransformSortOrder? Sort { get; set; } /// /// Whether key value metadata should be text, regardless of the actual input type @@ -8551,15 +8551,15 @@ public Dictionarizer(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -8567,7 +8567,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.TermTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.TermTransformColumn[] Column { get; set; } /// /// Maximum number of terms to keep per column when auto-training @@ -8582,7 +8582,7 @@ public void AddColumn(string name, string source) /// /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). /// - public Transforms.TermTransformSortOrder Sort { get; set; } = Transforms.TermTransformSortOrder.Occurrence; + public Microsoft.ML.Transforms.TermTransformSortOrder Sort { get; set; } = Microsoft.ML.Transforms.TermTransformSortOrder.Occurrence; /// /// Whether key value metadata should be text, regardless of the actual input type @@ -8900,15 +8900,15 @@ public GlobalContrastNormalizer(params ValueTuple[] inputOutputC public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -8916,7 +8916,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.LpNormNormalizerTransformGcnColumn[] Column { get; set; } + public Microsoft.ML.Transforms.LpNormNormalizerTransformGcnColumn[] Column { get; set; } /// /// Subtract mean from each value before normalizing @@ -9054,15 +9054,15 @@ public HashConverter(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9070,7 +9070,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.HashJoinTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.HashJoinTransformColumn[] Column { get; set; } /// /// Whether the values need to be combined for a single hash @@ -9188,15 +9188,15 @@ public KeyToTextConverter(params ValueTuple[] inputOutputColumns public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9204,7 +9204,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.KeyToValueTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.KeyToValueTransformColumn[] Column { get; set; } /// /// Input dataset @@ -9372,15 +9372,15 @@ public LabelIndicator(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9388,7 +9388,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.LabelIndicatorTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.LabelIndicatorTransformColumn[] Column { get; set; } /// /// Label of the positive class. @@ -9556,15 +9556,15 @@ public LogMeanVarianceNormalizer(params ValueTuple[] inputOutput public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9577,7 +9577,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NormalizeTransformLogNormalColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NormalizeTransformLogNormalColumn[] Column { get; set; } /// /// Max number of examples used to train the normalizer @@ -9645,7 +9645,7 @@ public sealed class LpNormNormalizerTransformColumn : OneToOneColumn /// The norm to use to normalize each sample /// - public Transforms.LpNormNormalizerTransformNormalizerKind? NormKind { get; set; } + public Microsoft.ML.Transforms.LpNormNormalizerTransformNormalizerKind? NormKind { get; set; } /// /// Subtract mean from each value before normalizing @@ -9698,15 +9698,15 @@ public LpNormalizer(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9714,12 +9714,12 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.LpNormNormalizerTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.LpNormNormalizerTransformColumn[] Column { get; set; } /// /// The norm to use to normalize each sample /// - public Transforms.LpNormNormalizerTransformNormalizerKind NormKind { get; set; } = Transforms.LpNormNormalizerTransformNormalizerKind.L2Norm; + public Microsoft.ML.Transforms.LpNormNormalizerTransformNormalizerKind NormKind { get; set; } = Microsoft.ML.Transforms.LpNormNormalizerTransformNormalizerKind.L2Norm; /// /// Subtract mean from each value before normalizing @@ -9840,15 +9840,15 @@ public MeanVarianceNormalizer(params ValueTuple[] inputOutputCol public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9861,7 +9861,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NormalizeTransformAffineColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NormalizeTransformAffineColumn[] Column { get; set; } /// /// Whether to map zero to zero, preserving sparsity @@ -9955,15 +9955,15 @@ public MinMaxNormalizer(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -9971,7 +9971,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NormalizeTransformAffineColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NormalizeTransformAffineColumn[] Column { get; set; } /// /// Whether to map zero to zero, preserving sparsity @@ -10048,7 +10048,7 @@ public sealed class NAHandleTransformColumn : OneToOneColumn /// The replacement method to utilize /// - public Transforms.NAHandleTransformReplacementKind? Kind { get; set; } + public Microsoft.ML.Transforms.NAHandleTransformReplacementKind? Kind { get; set; } /// /// Whether to impute values by slot @@ -10106,15 +10106,15 @@ public MissingValueHandler(params ValueTuple[] inputOutputColumn public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -10122,12 +10122,12 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:rep:src) /// - public Transforms.NAHandleTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NAHandleTransformColumn[] Column { get; set; } /// /// The replacement method to utilize /// - public Transforms.NAHandleTransformReplacementKind ReplaceWith { get; set; } = Transforms.NAHandleTransformReplacementKind.Def; + public Microsoft.ML.Transforms.NAHandleTransformReplacementKind ReplaceWith { get; set; } = Microsoft.ML.Transforms.NAHandleTransformReplacementKind.Def; /// /// Whether to impute values by slot @@ -10235,15 +10235,15 @@ public MissingValueIndicator(params ValueTuple[] inputOutputColu public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -10251,7 +10251,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NAIndicatorTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NAIndicatorTransformColumn[] Column { get; set; } /// /// Input dataset @@ -10349,15 +10349,15 @@ public MissingValuesDropper(params ValueTuple[] inputOutputColum public void AddColumn(string source) { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -10365,7 +10365,7 @@ public void AddColumn(string name, string source) /// /// Columns to drop the NAs for /// - public Transforms.NADropTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NADropTransformColumn[] Column { get; set; } /// /// Input dataset @@ -10505,7 +10505,7 @@ public sealed class NAReplaceTransformColumn : OneToOneColumn /// The replacement method to utilize /// - public Transforms.NAReplaceTransformReplacementKind? Kind { get; set; } + public Microsoft.ML.Transforms.NAReplaceTransformReplacementKind? Kind { get; set; } /// /// Whether to impute values by slot @@ -10558,15 +10558,15 @@ public MissingValueSubstitutor(params ValueTuple[] inputOutputCo public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -10574,12 +10574,12 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:rep:src) /// - public Transforms.NAReplaceTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NAReplaceTransformColumn[] Column { get; set; } /// /// The replacement method to utilize /// - public Transforms.NAReplaceTransformReplacementKind ReplacementKind { get; set; } = Transforms.NAReplaceTransformReplacementKind.Def; + public Microsoft.ML.Transforms.NAReplaceTransformReplacementKind ReplacementKind { get; set; } = Microsoft.ML.Transforms.NAReplaceTransformReplacementKind.Def; /// /// Whether to impute values by slot @@ -10693,7 +10693,7 @@ public sealed class NgramTransformColumn : OneToOneColumn, /// /// Statistical measure used to evaluate how important a word is to a document in a corpus /// - public Transforms.NgramTransformWeightingCriteria? Weighting { get; set; } + public Microsoft.ML.Transforms.NgramTransformWeightingCriteria? Weighting { get; set; } /// /// Name of the new column @@ -10741,15 +10741,15 @@ public NGramTranslator(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -10757,7 +10757,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NgramTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NgramTransformColumn[] Column { get; set; } /// /// Maximum ngram length @@ -10782,7 +10782,7 @@ public void AddColumn(string name, string source) /// /// The weighting criteria /// - public Transforms.NgramTransformWeightingCriteria Weighting { get; set; } = Transforms.NgramTransformWeightingCriteria.Tf; + public Microsoft.ML.Transforms.NgramTransformWeightingCriteria Weighting { get; set; } = Microsoft.ML.Transforms.NgramTransformWeightingCriteria.Tf; /// /// Input dataset @@ -11036,7 +11036,7 @@ public sealed partial class RandomNumberGenerator : Microsoft.ML.Runtime.EntryPo /// /// New column definition(s) (optional form: name:seed) /// - public Transforms.GenerateNumberTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.GenerateNumberTransformColumn[] Column { get; set; } /// /// Use an auto-incremented integer starting at zero instead of a random number @@ -11480,7 +11480,7 @@ public sealed partial class Segregator : Microsoft.ML.Runtime.EntryPoints.Common /// /// Specifies how to unroll multiple pivot columns of different size. /// - public Transforms.UngroupTransformUngroupMode Mode { get; set; } = Transforms.UngroupTransformUngroupMode.Inner; + public Microsoft.ML.Transforms.UngroupTransformUngroupMode Mode { get; set; } = Microsoft.ML.Transforms.UngroupTransformUngroupMode.Inner; /// /// Input dataset @@ -11629,15 +11629,15 @@ public SupervisedBinNormalizer(params ValueTuple[] inputOutputCo public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -11655,7 +11655,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.NormalizeTransformBinColumn[] Column { get; set; } + public Microsoft.ML.Transforms.NormalizeTransformBinColumn[] Column { get; set; } /// /// Max number of bins, power of 2 recommended @@ -11770,7 +11770,7 @@ public sealed class TermLoaderArguments /// /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). /// - public Transforms.TermTransformSortOrder Sort { get; set; } = Transforms.TermTransformSortOrder.Occurrence; + public Microsoft.ML.Transforms.TermTransformSortOrder Sort { get; set; } = Microsoft.ML.Transforms.TermTransformSortOrder.Occurrence; /// /// Drop unknown terms instead of mapping them to NA term. 
@@ -11796,19 +11796,19 @@ public TextFeaturizer(string outputColumn, params string[] inputColumns) public void AddColumn(string name, params string[] source) { - Column = ManyToOneColumn.Create(name, source); + Column = ManyToOneColumn.Create(name, source); } /// /// New column definition (optional form: name:srcs). /// - public Transforms.TextTransformColumn Column { get; set; } + public Microsoft.ML.Transforms.TextTransformColumn Column { get; set; } /// /// Dataset language or 'AutoDetect' to detect language per row. /// - public Transforms.TextTransformLanguage Language { get; set; } = Transforms.TextTransformLanguage.English; + public Microsoft.ML.Transforms.TextTransformLanguage Language { get; set; } = Microsoft.ML.Transforms.TextTransformLanguage.English; /// /// Stopwords remover. @@ -11819,7 +11819,7 @@ public void AddColumn(string name, params string[] source) /// /// Casing text using the rules of the invariant culture. /// - public Transforms.TextNormalizerTransformCaseNormalizationMode TextCase { get; set; } = Transforms.TextNormalizerTransformCaseNormalizationMode.Lower; + public Microsoft.ML.Transforms.TextNormalizerTransformCaseNormalizationMode TextCase { get; set; } = Microsoft.ML.Transforms.TextNormalizerTransformCaseNormalizationMode.Lower; /// /// Whether to keep diacritical marks or remove them. @@ -11844,7 +11844,7 @@ public void AddColumn(string name, params string[] source) /// /// A dictionary of whitelisted terms. /// - public Transforms.TermLoaderArguments Dictionary { get; set; } + public Microsoft.ML.Transforms.TermLoaderArguments Dictionary { get; set; } /// /// Ngram feature extractor to use for words (WordBag/WordHashBag). @@ -11861,7 +11861,7 @@ public void AddColumn(string name, params string[] source) /// /// Normalize vectors (rows) individually by rescaling them to unit norm. /// - public Transforms.TextTransformTextNormKind VectorNormalizer { get; set; } = Transforms.TextTransformTextNormKind.L2; + public Microsoft.ML.Transforms.TextTransformTextNormKind VectorNormalizer { get; set; } = Microsoft.ML.Transforms.TextTransformTextNormKind.L2; /// /// Input dataset @@ -11945,15 +11945,15 @@ public TextToKeyConverter(params ValueTuple[] inputOutputColumns public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -11961,7 +11961,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) (optional form: name:src) /// - public Transforms.TermTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.TermTransformColumn[] Column { get; set; } /// /// Maximum number of terms to keep per column when auto-training @@ -11976,7 +11976,7 @@ public void AddColumn(string name, string source) /// /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). 
/// - public Transforms.TermTransformSortOrder Sort { get; set; } = Transforms.TermTransformSortOrder.Occurrence; + public Microsoft.ML.Transforms.TermTransformSortOrder Sort { get; set; } = Microsoft.ML.Transforms.TermTransformSortOrder.Occurrence; /// /// Whether key value metadata should be text, regardless of the actual input type @@ -12228,15 +12228,15 @@ public WordTokenizer(params ValueTuple[] inputOutputColumns) public void AddColumn(string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); Column = list.ToArray(); } public void AddColumn(string name, string source) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(name, source)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); Column = list.ToArray(); } @@ -12244,7 +12244,7 @@ public void AddColumn(string name, string source) /// /// New column definition(s) /// - public Transforms.DelimitedTokenizeTransformColumn[] Column { get; set; } + public Microsoft.ML.Transforms.DelimitedTokenizeTransformColumn[] Column { get; set; } /// /// Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character. @@ -14039,6 +14039,57 @@ public sealed class SingleParallelTraining : ParallelTraining internal override string ComponentName => "Single"; } + public abstract class PartitionedPathParser : ComponentKind {} + + + + /// + /// Extract name/value pairs from Parquet formatted directory names. Example path: Year=2018/Month=12/data1.parquet + /// + public sealed class ParquetPathParserPartitionedPathParser : PartitionedPathParser + { + internal override string ComponentName => "ParquetPathParser"; + } + + + public sealed class PartitionedFileLoaderColumn + { + /// + /// Name of the column. + /// + public string Name { get; set; } + + /// + /// Data type of the column. + /// + public Microsoft.ML.Transforms.DataKind? Type { get; set; } + + /// + /// Source index of the column. + /// + public int Source { get; set; } + + } + + + /// + /// A simple parser that extracts directory names as column values. Column names are defined as arguments. + /// + public sealed class SimplePathParserPartitionedPathParser : PartitionedPathParser + { + /// + /// Column definitions used to override the Partitioned Path Parser. Expected with the format name:type:numeric-source, e.g. col=MyFeature:R4:1 + /// + public Microsoft.ML.Runtime.PartitionedFileLoaderColumn[] Columns { get; set; } + + /// + /// Data type of each column. + /// + public Microsoft.ML.Transforms.DataKind Type { get; set; } = Microsoft.ML.Transforms.DataKind.TX; + + internal override string ComponentName => "SimplePathParser"; + } + public abstract class RegressionLossFunction : ComponentKind {} diff --git a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs index f1e45fa446..7d5e1007ec 100644 --- a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs +++ b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs @@ -564,7 +564,7 @@ private static string GetSymbolFromType(Dictionary typesSymbolTa Contracts.Assert(typesSymbolTable.Select(kvp => kvp.Value).All(str => string.Compare(str, name) != 0)); - return name; + return "Microsoft.ML." 
+ name; } private void GenerateEnums(IndentingTextWriter writer, Type inputType, string currentNamespace) @@ -1079,7 +1079,7 @@ private void GenerateComponent(IndentingTextWriter writer, ModuleCatalog.Compone writer.WriteLine($"public sealed class {GeneratorUtils.GetComponentName(component)} : {component.Kind}"); writer.WriteLine("{"); writer.Indent(); - GenerateInputFields(writer, component.ArgumentType, catalog, _typesSymbolTable, "Microsoft.ML."); + GenerateInputFields(writer, component.ArgumentType, catalog, _typesSymbolTable); writer.WriteLine($"internal override string ComponentName => \"{component.Name}\";"); writer.Outdent(); writer.WriteLine("}"); From 748ffe7d53fe94aba53071b0e7bed5f03922c4fb Mon Sep 17 00:00:00 2001 From: tyclintw Date: Fri, 11 May 2018 12:47:04 -0700 Subject: [PATCH 06/25] Address comments. --- src/Microsoft.ML.Core/Utilities/PathUtils.cs | 6 ++++-- src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/PathUtils.cs b/src/Microsoft.ML.Core/Utilities/PathUtils.cs index 5ed719ba47..dc330b14bc 100644 --- a/src/Microsoft.ML.Core/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Core/Utilities/PathUtils.cs @@ -159,7 +159,7 @@ public static string MakePathRelative(string basepath, string path) if (baseUri.Scheme != uri.Scheme) { - throw new ArgumentException("Paths cannot be made relative as they are of different schemas."); + throw new ArgumentException("Paths cannot be made relative as they are of different schemes."); } string relativePath; @@ -196,8 +196,10 @@ public static string MakePathRelative(string basepath, string path) /// An enumerable list of all non-empty directories. public static IEnumerable SplitDirectories(string path) { + char [] separators = { Path.DirectorySeparatorChar }; + var cleanPath = path.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); - return cleanPath.Split(new char[] { Path.DirectorySeparatorChar }, StringSplitOptions.RemoveEmptyEntries); + return cleanPath.Split(separators, StringSplitOptions.RemoveEmptyEntries); } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index ceaeca3fa6..8a0862b730 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -175,7 +175,7 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS _host.CheckValue(files, nameof(files)); _pathParser = args.PathParserFactory.CreateComponent(_host); - _host.CheckValue(_pathParser, nameof(_pathParser), "Factory failed to create a FilePathSpec"); + _host.CheckValue(_pathParser, nameof(_pathParser), "Failed to create the FilePathSpec."); _subLoader = args.Loader; _files = files; From 4549388a41d09f74484fadd21e9c6676044fae9c Mon Sep 17 00:00:00 2001 From: tyclintw Date: Fri, 11 May 2018 12:50:32 -0700 Subject: [PATCH 07/25] Remove unused namespaces. 
--- test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs index c757ad3ae7..9d478a5b1f 100644 --- a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs +++ b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs @@ -1,9 +1,9 @@ -using Microsoft.ML.Runtime.RunTests; -using Microsoft.ML.TestFramework; -using System; -using System.Collections.Generic; +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.RunTests; using System.IO; -using System.Text; using Xunit; using Xunit.Abstractions; From 781a45e5bcb1e38b540ea7876b8abbffb64faab8 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 15 May 2018 13:27:55 -0700 Subject: [PATCH 08/25] Move subLoader to a byteArray so we aren't recreating with args. --- src/Microsoft.ML.Data/Commands/DataCommand.cs | 14 +++++ .../DataLoadSave/PartitionedFileLoader.cs | 55 +++++++++++++------ 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.Data/Commands/DataCommand.cs b/src/Microsoft.ML.Data/Commands/DataCommand.cs index a68dd2022a..083695825a 100644 --- a/src/Microsoft.ML.Data/Commands/DataCommand.cs +++ b/src/Microsoft.ML.Data/Commands/DataCommand.cs @@ -396,6 +396,20 @@ public static void SaveLoader(IDataLoader loader, IFileHandle file) Contracts.CheckParam(file.CanWrite, nameof(file), "Must be writable"); using (var stream = file.CreateWriteStream()) + { + SaveLoader(loader, stream); + } + } + + /// + /// Saves to the specified . + /// + public static void SaveLoader(IDataLoader loader, Stream stream) + { + Contracts.CheckValue(loader, nameof(loader)); + Contracts.CheckValue(stream, nameof(stream)); + Contracts.CheckParam(stream.CanWrite, nameof(stream), "Must be writable"); + using (var rep = RepositoryWriter.CreateNew(stream)) { ModelSaveContext.SaveModel(rep, loader, ModelFileUtils.DirDataLoaderModel); diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index 8a0862b730..d4a2922c99 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -154,13 +154,11 @@ public bool TryUnparse(StringBuilder sb) private readonly IHost _host; private readonly IMultiStreamSource _files; private readonly Column[] _columns; + private readonly byte[] _subLoaderBytes; // Number of tailing directories to include. private readonly int _tailingDirCount; - // An underlying loader used on each individual loader. 
- private readonly SubComponent _subLoader; - private readonly IPartitionedPathParser _pathParser; private const string RegistrationName = LoadName; @@ -177,9 +175,11 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS _pathParser = args.PathParserFactory.CreateComponent(_host); _host.CheckValue(_pathParser, nameof(_pathParser), "Failed to create the FilePathSpec."); - _subLoader = args.Loader; _files = files; + var subLoader = args.Loader.CreateInstance(_host, _files); + _subLoaderBytes = SaveLoaderToBytes(subLoader); + string relativePath = GetRelativePath(args.BasePath, files); _columns = ParseColumns(relativePath).ToArray(); _tailingDirCount = GetDirectoryCount(relativePath); @@ -196,7 +196,7 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS _columns = _columns.Concat(new[] { pathCol }).ToArray(); } - Schema = CreateSchema(_host, _columns, _subLoader); + Schema = CreateSchema(_host, _columns, subLoader); } private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSource files) @@ -211,7 +211,7 @@ private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSour // int: number of columns // foreach column: // string: column representation - // string: subloader + // byte[]: subloader // model: file path spec _tailingDirCount = ctx.Reader.ReadInt32(); @@ -227,13 +227,13 @@ private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSour _columns[i] = column; } - var loader = SubComponent.Parse(ctx.LoadString()); - _subLoader = new SubComponent(loader.Kind, loader.Settings); + _subLoaderBytes = ctx.Reader.ReadByteArray(); ctx.LoadModel(_host, out _pathParser, FilePathSpecName); _files = files; - Schema = CreateSchema(_host, _columns, _subLoader); + var loader = CreateLoaderFromBytes(_subLoaderBytes, files); + Schema = CreateSchema(_host, _columns, loader); } public static PartitionedFileLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files) @@ -260,7 +260,7 @@ public void Save(ModelSaveContext ctx) // int: number of columns // foreach column: // string: column representation - // string: subloader + // byte[]: subloader // model: file path spec ctx.Writer.Write(_tailingDirCount); @@ -274,7 +274,7 @@ public void Save(ModelSaveContext ctx) ctx.SaveString(sb.ToString()); } - ctx.SaveString(_subLoader.ToString()); + ctx.Writer.WriteByteArray(_subLoaderBytes); ctx.SaveModel(_pathParser, FilePathSpecName); } @@ -306,17 +306,17 @@ public IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator, Fun /// /// The exception context. /// The partitioned columns. - /// The sub loader. + /// The sub loader. /// The resulting schema. 
- private ISchema CreateSchema(IExceptionContext ectx, Column[] cols, SubComponent subComponent) + private ISchema CreateSchema(IExceptionContext ectx, Column[] cols, IDataLoader subLoader) { Contracts.AssertValue(cols); - Contracts.AssertValue(subComponent); + Contracts.AssertValue(subLoader); var columnNameTypes = cols.Select((col) => new KeyValuePair(col.Name, PrimitiveType.FromKind(col.Type.Value))); var colSchema = new SimpleSchema(ectx, columnNameTypes.ToArray()); - SubSchema = subComponent.CreateInstance(_host, _files).Schema; + SubSchema = subLoader.Schema; if (SubSchema.ColumnCount == 0) { @@ -334,6 +334,29 @@ private ISchema CreateSchema(IExceptionContext ectx, Column[] cols, SubComponent } } + private byte [] SaveLoaderToBytes(IDataLoader loader) + { + Contracts.CheckValue(loader, nameof(loader)); + + using (var stream = new MemoryStream()) + { + LoaderUtils.SaveLoader(loader, stream); + return stream.GetBuffer(); + } + } + + private IDataLoader CreateLoaderFromBytes(byte [] loaderBytes, IMultiStreamSource files) + { + Contracts.CheckValue(loaderBytes, nameof(loaderBytes)); + Contracts.CheckValue(files, nameof(files)); + + using (var stream = new MemoryStream(loaderBytes)) + using (var rep = RepositoryReader.Open(stream, _host)) + { + return ModelFileUtils.LoadLoader(_host, rep, files, false); + } + } + private sealed class Cursor : RootCursorBase, IRowCursor { private PartitionedFileLoader _parent; @@ -422,7 +445,7 @@ protected override bool MoveNextCore() try { // Load the sub cursor and reset the data. - loader = _parent._subLoader.CreateInstance(_parent._host, new MultiFileSource(path)); + loader = _parent.CreateLoaderFromBytes(_parent._subLoaderBytes, new MultiFileSource(path)); } catch (Exception e) { From e54698a033401194237bb6b48161393aefe4238e Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 15 May 2018 14:24:15 -0700 Subject: [PATCH 09/25] Save and load ISchema directly instead of the Column []. --- .../DataLoadSave/PartitionedFileLoader.cs | 88 +++++++++---------- 1 file changed, 40 insertions(+), 48 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index d4a2922c99..f90368dd73 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -11,6 +11,7 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.Conversion; +using Microsoft.ML.Runtime.Data.IO; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; @@ -153,7 +154,7 @@ public bool TryUnparse(StringBuilder sb) private readonly IHost _host; private readonly IMultiStreamSource _files; - private readonly Column[] _columns; + private readonly int[] _srcColumns; private readonly byte[] _subLoaderBytes; // Number of tailing directories to include. 
@@ -162,7 +163,8 @@ public bool TryUnparse(StringBuilder sb) private readonly IPartitionedPathParser _pathParser; private const string RegistrationName = LoadName; - private const string FilePathSpecName = "FilePathSpec"; + private const string FilePathSpecCtxName = "FilePathSpec"; + private const string SchemaCtxName = "Schema.idv"; private const int FilePathColIndex = -1; public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamSource files) @@ -181,7 +183,7 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS _subLoaderBytes = SaveLoaderToBytes(subLoader); string relativePath = GetRelativePath(args.BasePath, files); - _columns = ParseColumns(relativePath).ToArray(); + var columns = ParseColumns(relativePath).ToArray(); _tailingDirCount = GetDirectoryCount(relativePath); if (args.IncludePathColumn) @@ -193,10 +195,11 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS Type = DataKind.Text }; - _columns = _columns.Concat(new[] { pathCol }).ToArray(); + columns = columns.Concat(new[] { pathCol }).ToArray(); } - Schema = CreateSchema(_host, _columns, subLoader); + _srcColumns = columns.Select(c => c.Source).ToArray(); + Schema = CreateSchema(_host, columns, subLoader); } private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSource files) @@ -208,32 +211,28 @@ private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSour // ** Binary format ** // int: tailing directory count - // int: number of columns - // foreach column: - // string: column representation + // Schema of the loader + // int[]: srcColumns // byte[]: subloader // model: file path spec _tailingDirCount = ctx.Reader.ReadInt32(); - int numColumns = ctx.Reader.ReadInt32(); - _host.CheckDecode(numColumns >= 0); - - _columns = new Column[numColumns]; - for (int i = 0; i < numColumns; i++) - { - var column = Column.Parse(ctx.LoadString()); - _host.CheckDecode(column != null); - _columns[i] = column; - } + // Load the schema + byte[] buffer = null; + if (!ctx.TryLoadBinaryStream(SchemaCtxName, r => buffer = r.ReadByteArray())) + throw _host.ExceptDecode(); + BinaryLoader loader = null; + var strm = new MemoryStream(buffer, writable: false); + loader = new BinaryLoader(_host, new BinaryLoader.Arguments(), strm); + Schema = loader.Schema; + _srcColumns = ctx.Reader.ReadIntArray(); _subLoaderBytes = ctx.Reader.ReadByteArray(); - ctx.LoadModel(_host, out _pathParser, FilePathSpecName); + ctx.LoadModel(_host, out _pathParser, FilePathSpecCtxName); _files = files; - var loader = CreateLoaderFromBytes(_subLoaderBytes, files); - Schema = CreateSchema(_host, _columns, loader); } public static PartitionedFileLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files) @@ -257,33 +256,34 @@ public void Save(ModelSaveContext ctx) // ** Binary format ** // int: tailing directory count - // int: number of columns - // foreach column: - // string: column representation + // Schema of the loader + // int[]: srcColumns // byte[]: subloader // model: file path spec ctx.Writer.Write(_tailingDirCount); - ctx.Writer.Write(_columns.Length); - StringBuilder sb = new StringBuilder(); - foreach (var col in _columns) + // Save the schema + var noRows = new EmptyDataView(_host, Schema); + var saverArgs = new BinarySaver.Arguments(); + saverArgs.Silent = true; + var saver = new BinarySaver(_host, saverArgs); + using (var strm = new MemoryStream()) { - sb.Clear(); - _host.Check(col.TryUnparse(sb)); - 
ctx.SaveString(sb.ToString()); + var allColumns = Enumerable.Range(0, Schema.ColumnCount).ToArray(); + saver.SaveData(strm, noRows, allColumns); + ctx.SaveBinaryStream(SchemaCtxName, w => w.WriteByteArray(strm.ToArray())); } + ctx.Writer.WriteIntArray(_srcColumns); ctx.Writer.WriteByteArray(_subLoaderBytes); - ctx.SaveModel(_pathParser, FilePathSpecName); + ctx.SaveModel(_pathParser, FilePathSpecCtxName); } public bool CanShuffle => true; public ISchema Schema { get; } - private ISchema SubSchema { get; set; } - public long? GetRowCount(bool lazy = true) { return null; @@ -316,9 +316,9 @@ private ISchema CreateSchema(IExceptionContext ectx, Column[] cols, IDataLoader var columnNameTypes = cols.Select((col) => new KeyValuePair(col.Name, PrimitiveType.FromKind(col.Type.Value))); var colSchema = new SimpleSchema(ectx, columnNameTypes.ToArray()); - SubSchema = subLoader.Schema; + var subSchema = subLoader.Schema; - if (SubSchema.ColumnCount == 0) + if (subSchema.ColumnCount == 0) { return colSchema; } @@ -326,7 +326,7 @@ private ISchema CreateSchema(IExceptionContext ectx, Column[] cols, IDataLoader { var schemas = new ISchema[] { - SubSchema, + subSchema, colSchema }; @@ -382,7 +382,7 @@ public Cursor(IChannelProvider provider, PartitionedFileLoader parent, IMultiStr _active = Utils.BuildArray(Schema.ColumnCount, predicate); _subActive = _active.Take(SubColumnCount).ToArray(); - _colValues = new DvText[_parent._columns.Length]; + _colValues = new DvText[Schema.ColumnCount - SubColumnCount]; _subGetters = new Delegate[SubColumnCount]; _getters = CreateGetters(); @@ -453,12 +453,6 @@ protected override bool MoveNextCore() continue; } - if (!SchemasMatch(_parent.SubSchema, loader.Schema)) - { - Ch.Warning($"Schema of file {path} does not match."); - continue; - } - _subCursor = loader.GetRowCursor(col => _subActive[col]); try @@ -527,7 +521,7 @@ private void UpdateSubGetters() { if (_subActive[i]) { - var type = _parent.SubSchema.GetColumnType(i); + var type = _subCursor.Schema.GetColumnType(i); _subGetters[i] = MarshalGetter(_subCursor.GetGetter, type.RawType, i); } } @@ -538,9 +532,7 @@ private void UpdateColumnValues(string path, List values) // Cache the column values for future Getter calls. for (int i = 0; i < _colValues.Length; i++) { - var col = _parent._columns[i]; - - var source = col.Source; + var source = _parent._srcColumns[i]; if (source >= 0 && source < values.Count) { _colValues[i] = new DvText(values[source]); @@ -623,7 +615,7 @@ private bool IsSubColumn(int col) return col < SubColumnCount; } - private int SubColumnCount => Schema.ColumnCount - _parent._columns.Length; + private int SubColumnCount => Schema.ColumnCount - _parent._srcColumns.Length; private IEnumerable CreateFileOrder(IRandom rand) { From 885ff301d87890cdff54d00a7ec567be6f4d8c32 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 15 May 2018 14:28:36 -0700 Subject: [PATCH 10/25] Update help text for clarity. --- .../DataLoadSave/PartitionedFileLoader.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index f90368dd73..b39c9e24ba 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -85,7 +85,7 @@ public sealed class Column [Argument(ArgumentType.AtMostOnce, HelpText = "Data type of the column.")] public DataKind? 
Type; - [Argument(ArgumentType.Required, HelpText = "Source index of the column.")] + [Argument(ArgumentType.Required, HelpText = "Index of the directory representing this column.")] public int Source; public static Column Parse(string str) @@ -154,7 +154,7 @@ public bool TryUnparse(StringBuilder sb) private readonly IHost _host; private readonly IMultiStreamSource _files; - private readonly int[] _srcColumns; + private readonly int[] _srcDirIndex; private readonly byte[] _subLoaderBytes; // Number of tailing directories to include. @@ -198,7 +198,7 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS columns = columns.Concat(new[] { pathCol }).ToArray(); } - _srcColumns = columns.Select(c => c.Source).ToArray(); + _srcDirIndex = columns.Select(c => c.Source).ToArray(); Schema = CreateSchema(_host, columns, subLoader); } @@ -227,7 +227,7 @@ private PartitionedFileLoader(IHost host, ModelLoadContext ctx, IMultiStreamSour loader = new BinaryLoader(_host, new BinaryLoader.Arguments(), strm); Schema = loader.Schema; - _srcColumns = ctx.Reader.ReadIntArray(); + _srcDirIndex = ctx.Reader.ReadIntArray(); _subLoaderBytes = ctx.Reader.ReadByteArray(); ctx.LoadModel(_host, out _pathParser, FilePathSpecCtxName); @@ -274,7 +274,7 @@ public void Save(ModelSaveContext ctx) saver.SaveData(strm, noRows, allColumns); ctx.SaveBinaryStream(SchemaCtxName, w => w.WriteByteArray(strm.ToArray())); } - ctx.Writer.WriteIntArray(_srcColumns); + ctx.Writer.WriteIntArray(_srcDirIndex); ctx.Writer.WriteByteArray(_subLoaderBytes); ctx.SaveModel(_pathParser, FilePathSpecCtxName); @@ -532,7 +532,7 @@ private void UpdateColumnValues(string path, List values) // Cache the column values for future Getter calls. for (int i = 0; i < _colValues.Length; i++) { - var source = _parent._srcColumns[i]; + var source = _parent._srcDirIndex[i]; if (source >= 0 && source < values.Count) { _colValues[i] = new DvText(values[source]); @@ -615,7 +615,7 @@ private bool IsSubColumn(int col) return col < SubColumnCount; } - private int SubColumnCount => Schema.ColumnCount - _parent._srcColumns.Length; + private int SubColumnCount => Schema.ColumnCount - _parent._srcDirIndex.Length; private IEnumerable CreateFileOrder(IRandom rand) { From bbf8de845665440fbcc3633ce75f49f9fc6954c2 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 15 May 2018 15:48:26 -0700 Subject: [PATCH 11/25] Fix linux test failures. --- .../Common/EntryPoints/core_manifest.json | 134 ++++++++++++++++++ .../PartitionedFileLoaderTests.cs | 8 +- 2 files changed, 138 insertions(+), 4 deletions(-) diff --git a/ZBaselines/Common/EntryPoints/core_manifest.json b/ZBaselines/Common/EntryPoints/core_manifest.json index a3778a7f7f..772fe14272 100644 --- a/ZBaselines/Common/EntryPoints/core_manifest.json +++ b/ZBaselines/Common/EntryPoints/core_manifest.json @@ -21329,6 +21329,140 @@ } ] }, + { + "Kind": "PartitionedPathParser", + "Components": [ + { + "Name": "ParquetPathParser", + "Desc": "Extract name/value pairs from Parquet formatted directory names. Example path: Year=2018/Month=12/data1.parquet", + "FriendlyName": "Parquet Partitioned Path Parser", + "Aliases": [ + "ParqPP" + ], + "Settings": [] + }, + { + "Name": "SimplePathParser", + "Desc": "A simple parser that extracts directory names as column values. 
Column names are defined as arguments.", + "FriendlyName": "Simple Partitioned Path Parser", + "Aliases": [ + "SmplPP" + ], + "Settings": [ + { + "Name": "Columns", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the column.", + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + "Name": "Type", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "Data type of the column.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Source", + "Type": "Int", + "Desc": "Index of the directory representing this column.", + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + } + ] + } + }, + "Desc": "Column definitions used to override the Partitioned Path Parser. Expected with the format name:type:numeric-source, e.g. col=MyFeature:R4:1", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Type", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "Data type of each column.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "TX" + } + ] + } + ] + }, { "Kind": "RegressionLossFunction", "Components": [ diff --git a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs index 9d478a5b1f..8ce066f18b 100644 --- a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs +++ b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs @@ -20,8 +20,8 @@ public PartitionedFileLoaderTests(ITestOutputHelper output) [Fact] public void PartitionedNamedDirectories() { - string basePath = Path.Combine(SamplesDir, @"Partitioned\Named"); - string pathData = Path.Combine(basePath, @"...\*.csv"); + string basePath = Path.Combine(SamplesDir, "Partitioned", "Named"); + string pathData = Path.Combine(basePath, "...", "*.csv"); TestCore(pathData, false, new[] { @@ -34,8 +34,8 @@ public void PartitionedNamedDirectories() [Fact] public void PartitionedUnnamedDirectories() { - string basePath = Path.Combine(SamplesDir, @"Partitioned\Unnamed"); - string pathData = Path.Combine(basePath, @"...\*.csv"); + string basePath = Path.Combine(SamplesDir, "Partitioned", "Unnamed"); + string pathData = Path.Combine(basePath, "...", "*.csv"); TestCore(pathData, false, new[] { From 225b7ee7fb9a6297ff647dd27ad3c52a3d17dd97 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 16 May 2018 10:30:42 -0700 Subject: [PATCH 12/25] Force path output to be unix formatted for consistency between OS tests. 
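A minimal standalone sketch of the normalization this change relies on; the class and method names below are illustrative only and not part of the patch (the loader performs the replacement inline when it caches the Path column value):

    internal static class PathNormalization
    {
        // Baseline files are shared across operating systems, so emitted paths
        // must not depend on Path.DirectorySeparatorChar.
        public static string ToUnixPath(string path)
            => path.Replace('\\', '/');
    }

With this in place the Path column reads, for example, 2017/01/data1.csv on both Windows and Linux, which is what the baseline updates in this commit reflect.
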
--- .../PartitionedUnnamedDirectories-Data.txt | 16 ++++++++-------- .../PartitionedUnnamedDirectories-Data.txt | 16 ++++++++-------- .../DataLoadSave/PartitionedFileLoader.cs | 4 +++- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt b/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt index 103029123f..4c7f650844 100644 --- a/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt +++ b/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt @@ -6,11 +6,11 @@ #@ col=Path:TX:2 #@ } L0 Month Path -1 1 2017\01\data1.csv -5 1 2017\01\data2.csv -7 1 2017\01\data2.csv -0 1 2017\01\dataBadSchema.csv -0 1 2017\01\dataBadSchema.csv -22 2 2017\02\data1.csv -24 2 2017\02\data1.csv -26 2 2017\02\data1.csv +1 1 2017/01/data1.csv +5 1 2017/01/data2.csv +7 1 2017/01/data2.csv +0 1 2017/01/dataBadSchema.csv +0 1 2017/01/dataBadSchema.csv +22 2 2017/02/data1.csv +24 2 2017/02/data1.csv +26 2 2017/02/data1.csv diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt index 103029123f..4c7f650844 100644 --- a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt +++ b/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt @@ -6,11 +6,11 @@ #@ col=Path:TX:2 #@ } L0 Month Path -1 1 2017\01\data1.csv -5 1 2017\01\data2.csv -7 1 2017\01\data2.csv -0 1 2017\01\dataBadSchema.csv -0 1 2017\01\dataBadSchema.csv -22 2 2017\02\data1.csv -24 2 2017\02\data1.csv -26 2 2017\02\data1.csv +1 1 2017/01/data1.csv +5 1 2017/01/data2.csv +7 1 2017/01/data2.csv +0 1 2017/01/dataBadSchema.csv +0 1 2017/01/dataBadSchema.csv +22 2 2017/02/data1.csv +24 2 2017/02/data1.csv +26 2 2017/02/data1.csv diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index b39c9e24ba..a89ab2182d 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -539,7 +539,9 @@ private void UpdateColumnValues(string path, List values) } else if (source == FilePathColIndex) { - _colValues[i] = new DvText(path); + // Force Unix path for consistency. + var cleanPath = path.Replace(@"\", @"/"); + _colValues[i] = new DvText(cleanPath); } } } From 4497c355fa5d78f97899598cda521c5e7a6f0c20 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 16 May 2018 10:56:25 -0700 Subject: [PATCH 13/25] Fix ZBaselines release folder name. 
--- .../PartitionedNamedDirectories-Data.txt | 0 .../PartitionedNamedDirectories-Schema.txt | 0 .../PartitionedUnnamedDirectories-Data.txt | 0 .../PartitionedUnnamedDirectories-Schema.txt | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename ZBaselines/SingleRelease/{PartitionedLoader => SavePipe}/PartitionedNamedDirectories-Data.txt (100%) rename ZBaselines/SingleRelease/{PartitionedLoader => SavePipe}/PartitionedNamedDirectories-Schema.txt (100%) rename ZBaselines/SingleRelease/{PartitionedLoader => SavePipe}/PartitionedUnnamedDirectories-Data.txt (100%) rename ZBaselines/SingleRelease/{PartitionedLoader => SavePipe}/PartitionedUnnamedDirectories-Schema.txt (100%) diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Data.txt b/ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt similarity index 100% rename from ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Data.txt rename to ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Schema.txt b/ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt similarity index 100% rename from ZBaselines/SingleRelease/PartitionedLoader/PartitionedNamedDirectories-Schema.txt rename to ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt b/ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt similarity index 100% rename from ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Data.txt rename to ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt diff --git a/ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Schema.txt b/ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt similarity index 100% rename from ZBaselines/SingleRelease/PartitionedLoader/PartitionedUnnamedDirectories-Schema.txt rename to ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt From 1e01903e22e0f47448d792b14b3e025df9b1839c Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 16 May 2018 12:51:21 -0700 Subject: [PATCH 14/25] Sort file listings to guarantee "Expand" ordering across operating systems. 
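The underlying issue is that Directory.GetFiles and Directory.GetDirectories make no ordering guarantee, so the expanded file list can differ between operating systems and file systems. A hedged sketch of the approach as a standalone helper (the class and method names are illustrative; the patch applies OrderBy inline inside StreamUtils.Expand):

    using System.IO;
    using System.Linq;

    internal static class FileExpansion
    {
        // Sort the enumeration before use so wildcard expansion is deterministic
        // regardless of the order in which the file system returns entries.
        public static string[] GetFilesSorted(string dir, string pattern)
            => Directory.GetFiles(dir, pattern).OrderBy(f => f).ToArray();
    }

With the listings sorted, the row order produced by the partitioned loader matches the checked-in baselines on every platform.
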
--- src/Microsoft.ML.Data/Utilities/StreamUtils.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index 45a808c55e..570e962694 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.IO; +using System.Linq; namespace Microsoft.ML.Runtime.Internal.Utilities { @@ -94,7 +95,7 @@ private static string[] Expand(string pattern) try { // this is actually incorrect, for 3-char extensions: *** - var files = Directory.GetFiles(dir, right); + var files = Directory.GetFiles(dir, right).OrderBy(f => f).ToArray(); if (pathEmpty) { for (int i = 0; i < files.Length; i++) @@ -104,7 +105,7 @@ private static string[] Expand(string pattern) } } matchList.AddRange(files); - var subs = Directory.GetDirectories(dir); + var subs = Directory.GetDirectories(dir).OrderBy(f => f).ToArray(); for (var i = subs.Length - 1; i >= 0; i--) dirsLeft.Push(subs[i]); } @@ -125,7 +126,7 @@ private static string[] Expand(string pattern) // watch for lack of access: try { - var files = Directory.GetFiles(path, Path.GetFileName(currentPattern)); + var files = Directory.GetFiles(path, Path.GetFileName(currentPattern)).OrderBy(f => f).ToArray(); if (pathEmpty) { for (int i = 0; i < files.Length; i++) @@ -169,6 +170,7 @@ private static string[] Expand(string pattern) } } } + return matchList.ToArray(); } #endif From 10f47b41d17e47b400c384a0b015a7f5374547ce Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 16 May 2018 12:51:42 -0700 Subject: [PATCH 15/25] Whitespace. --- src/Microsoft.ML.Data/Utilities/StreamUtils.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index 570e962694..ac05684a8e 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -170,7 +170,6 @@ private static string[] Expand(string pattern) } } } - return matchList.ToArray(); } #endif From 90bedc48e2896f1317fa18b88c1d9cb24dbb9194 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 22 May 2018 10:42:02 -0700 Subject: [PATCH 16/25] Move test files from Samples to test/data --- test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs | 4 ++-- .../data}/Partitioned/Named/Year=2017/Month=01/data1.csv | 0 .../data}/Partitioned/Named/Year=2017/Month=01/data2.csv | 0 .../data}/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv | 0 .../data}/Partitioned/Named/Year=2017/Month=02/data1.csv | 0 .../data}/Partitioned/Named/Year=2017/TestBadDir/data1.csv | 0 {Samples => test/data}/Partitioned/Unnamed/2017/01/data1.csv | 0 {Samples => test/data}/Partitioned/Unnamed/2017/01/data2.csv | 0 .../data}/Partitioned/Unnamed/2017/01/dataBadSchema.csv | 0 {Samples => test/data}/Partitioned/Unnamed/2017/02/data1.csv | 0 10 files changed, 2 insertions(+), 2 deletions(-) rename {Samples => test/data}/Partitioned/Named/Year=2017/Month=01/data1.csv (100%) rename {Samples => test/data}/Partitioned/Named/Year=2017/Month=01/data2.csv (100%) rename {Samples => test/data}/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv (100%) rename {Samples => test/data}/Partitioned/Named/Year=2017/Month=02/data1.csv (100%) rename {Samples => test/data}/Partitioned/Named/Year=2017/TestBadDir/data1.csv (100%) rename {Samples => test/data}/Partitioned/Unnamed/2017/01/data1.csv (100%) rename {Samples => 
test/data}/Partitioned/Unnamed/2017/01/data2.csv (100%) rename {Samples => test/data}/Partitioned/Unnamed/2017/01/dataBadSchema.csv (100%) rename {Samples => test/data}/Partitioned/Unnamed/2017/02/data1.csv (100%) diff --git a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs index 8ce066f18b..4b5371a98b 100644 --- a/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs +++ b/test/Microsoft.ML.Tests/PartitionedFileLoaderTests.cs @@ -20,7 +20,7 @@ public PartitionedFileLoaderTests(ITestOutputHelper output) [Fact] public void PartitionedNamedDirectories() { - string basePath = Path.Combine(SamplesDir, "Partitioned", "Named"); + string basePath = GetDataPath("Partitioned", "Named"); string pathData = Path.Combine(basePath, "...", "*.csv"); TestCore(pathData, false, @@ -34,7 +34,7 @@ public void PartitionedNamedDirectories() [Fact] public void PartitionedUnnamedDirectories() { - string basePath = Path.Combine(SamplesDir, "Partitioned", "Unnamed"); + string basePath = GetDataPath("Partitioned", "Unnamed"); ; string pathData = Path.Combine(basePath, "...", "*.csv"); TestCore(pathData, false, diff --git a/Samples/Partitioned/Named/Year=2017/Month=01/data1.csv b/test/data/Partitioned/Named/Year=2017/Month=01/data1.csv similarity index 100% rename from Samples/Partitioned/Named/Year=2017/Month=01/data1.csv rename to test/data/Partitioned/Named/Year=2017/Month=01/data1.csv diff --git a/Samples/Partitioned/Named/Year=2017/Month=01/data2.csv b/test/data/Partitioned/Named/Year=2017/Month=01/data2.csv similarity index 100% rename from Samples/Partitioned/Named/Year=2017/Month=01/data2.csv rename to test/data/Partitioned/Named/Year=2017/Month=01/data2.csv diff --git a/Samples/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv b/test/data/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv similarity index 100% rename from Samples/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv rename to test/data/Partitioned/Named/Year=2017/Month=01/dataEmpty.csv diff --git a/Samples/Partitioned/Named/Year=2017/Month=02/data1.csv b/test/data/Partitioned/Named/Year=2017/Month=02/data1.csv similarity index 100% rename from Samples/Partitioned/Named/Year=2017/Month=02/data1.csv rename to test/data/Partitioned/Named/Year=2017/Month=02/data1.csv diff --git a/Samples/Partitioned/Named/Year=2017/TestBadDir/data1.csv b/test/data/Partitioned/Named/Year=2017/TestBadDir/data1.csv similarity index 100% rename from Samples/Partitioned/Named/Year=2017/TestBadDir/data1.csv rename to test/data/Partitioned/Named/Year=2017/TestBadDir/data1.csv diff --git a/Samples/Partitioned/Unnamed/2017/01/data1.csv b/test/data/Partitioned/Unnamed/2017/01/data1.csv similarity index 100% rename from Samples/Partitioned/Unnamed/2017/01/data1.csv rename to test/data/Partitioned/Unnamed/2017/01/data1.csv diff --git a/Samples/Partitioned/Unnamed/2017/01/data2.csv b/test/data/Partitioned/Unnamed/2017/01/data2.csv similarity index 100% rename from Samples/Partitioned/Unnamed/2017/01/data2.csv rename to test/data/Partitioned/Unnamed/2017/01/data2.csv diff --git a/Samples/Partitioned/Unnamed/2017/01/dataBadSchema.csv b/test/data/Partitioned/Unnamed/2017/01/dataBadSchema.csv similarity index 100% rename from Samples/Partitioned/Unnamed/2017/01/dataBadSchema.csv rename to test/data/Partitioned/Unnamed/2017/01/dataBadSchema.csv diff --git a/Samples/Partitioned/Unnamed/2017/02/data1.csv b/test/data/Partitioned/Unnamed/2017/02/data1.csv similarity index 100% rename from 
Samples/Partitioned/Unnamed/2017/02/data1.csv rename to test/data/Partitioned/Unnamed/2017/02/data1.csv From c0467e6af31e6b08672bdb2fd18302f494543264 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 22 May 2018 14:46:23 -0700 Subject: [PATCH 17/25] Address comments. --- src/Microsoft.ML.Core/Utilities/PathUtils.cs | 66 +---------------- .../DataLoadSave/PartitionedFileLoader.cs | 7 +- .../DataLoadSave/PartitionedPathParser.cs | 5 +- src/Microsoft.ML.Data/Utilities/PathUtils.cs | 73 +++++++++++++++++++ 4 files changed, 84 insertions(+), 67 deletions(-) create mode 100644 src/Microsoft.ML.Data/Utilities/PathUtils.cs diff --git a/src/Microsoft.ML.Core/Utilities/PathUtils.cs b/src/Microsoft.ML.Core/Utilities/PathUtils.cs index dc330b14bc..74ccec30c0 100644 --- a/src/Microsoft.ML.Core/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Core/Utilities/PathUtils.cs @@ -3,9 +3,7 @@ // See the LICENSE file in the project root for more information. using System; -using System.Collections.Generic; using System.IO; -using System.Linq; using System.Threading; namespace Microsoft.ML.Runtime.Internal.Utilities @@ -69,13 +67,17 @@ public static string FindExistentFileOrNull(string fileName, string folderPrefix // 1. Search in customSearchDir. if (!string.IsNullOrWhiteSpace(customSearchDir) && TryFindFile(fileName, folderPrefix, customSearchDir, out candidate)) + { return candidate; + } // 2. Search in the path specified by the environment variable. var envDir = Environment.GetEnvironmentVariable(CustomSearchDirEnvVariable); if (!string.IsNullOrWhiteSpace(envDir) && TryFindFile(fileName, folderPrefix, envDir, out candidate)) + { return candidate; + } // 3. Search in the path specified by the assemblyForBasePath. if (assemblyForBasePath != null) @@ -141,65 +143,5 @@ public static string CreateFolderIfNotExists(string folder) return null; } - - /// - /// Make a full path realtive to a base path. - /// - /// The base path, assumed to be a directory. - /// The full path. - /// The relative path. - /// If the paths are not relative. - public static string MakePathRelative(string basepath, string path) - { - Contracts.AssertNonEmpty(basepath); - Contracts.AssertNonEmpty(path); - - Uri baseUri = new Uri(basepath); - Uri uri = new Uri(path); - - if (baseUri.Scheme != uri.Scheme) - { - throw new ArgumentException("Paths cannot be made relative as they are of different schemes."); - } - - string relativePath; - try - { - if (!baseUri.AbsoluteUri.EndsWith("/")) - { - baseUri = new Uri(baseUri.AbsoluteUri + "/"); - } - - relativePath = baseUri.MakeRelativeUri(uri).ToString(); - } - catch (ArgumentNullException e) - { - throw new ArgumentException("Paths could not be made relative.", e); - } - catch (InvalidOperationException e) - { - throw new ArgumentException("Paths could not be made relative.", e); - } - - if (uri.Scheme.Equals("file", StringComparison.InvariantCultureIgnoreCase)) - { - relativePath = relativePath.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); - } - - return relativePath; - } - - /// - /// Split a path string into an enumerable list of the directories. - /// - /// The path string to split. - /// An enumerable list of all non-empty directories. 
- public static IEnumerable SplitDirectories(string path) - { - char [] separators = { Path.DirectorySeparatorChar }; - - var cleanPath = path.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); - return cleanPath.Split(separators, StringSplitOptions.RemoveEmptyEntries); - } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index a89ab2182d..215fbeadab 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -12,6 +12,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.Conversion; using Microsoft.ML.Runtime.Data.IO; +using Microsoft.ML.Runtime.Data.Utilities; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; @@ -673,7 +674,7 @@ private bool TryTruncatePath(int dirCount, string path, out string truncPath) truncPath = null; // Remove directories that shouldn't be parsed. - var segments = Utils.SplitDirectories(path); + var segments = PathUtils.SplitDirectories(path); segments = segments.Skip(segments.Count() - dirCount - 1); if (segments.Count() < dirCount - 1) @@ -727,7 +728,7 @@ private string GetRelativePath(string basepath, IMultiStreamSource files) string path = files.GetPathOrNull(0); _host.CheckNonEmpty(path, nameof(path)); - var relativePath = Utils.MakePathRelative(basepath, path); + var relativePath = PathUtils.MakePathRelative(basepath, path); return relativePath; } @@ -748,7 +749,7 @@ private IEnumerable ParseColumns(string path) /// The number of directories private int GetDirectoryCount(string path) { - return Utils.SplitDirectories(path).Count() - 1; + return PathUtils.SplitDirectories(path).Count() - 1; } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs index 6392fd5090..08b17aad55 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs @@ -10,6 +10,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Data.Utilities; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; @@ -185,7 +186,7 @@ public IEnumerable ParseValues(string path) { Contracts.AssertNonEmpty(path); - var dirs = Utils.SplitDirectories(path); + var dirs = PathUtils.SplitDirectories(path); return dirs.Take(dirs.Count() - 1); // Ignore last directory which is the file name. } } @@ -331,7 +332,7 @@ public bool TryParseNamesAndValues(string path, out List names, out List return false; } - var dirs = Utils.SplitDirectories(path); + var dirs = PathUtils.SplitDirectories(path); dirs = dirs.Take(dirs.Count() - 1); // Ignore last directory which is the file name. names = new List(dirs.Count()); diff --git a/src/Microsoft.ML.Data/Utilities/PathUtils.cs b/src/Microsoft.ML.Data/Utilities/PathUtils.cs new file mode 100644 index 0000000000..92db9f2eb4 --- /dev/null +++ b/src/Microsoft.ML.Data/Utilities/PathUtils.cs @@ -0,0 +1,73 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.IO; + +namespace Microsoft.ML.Runtime.Data.Utilities +{ + internal static class PathUtils + { + /// + /// Make a full path realtive to a base path. + /// + /// The base path, assumed to be a directory. + /// The full path. + /// The relative path. + /// If the paths are not relative. + internal static string MakePathRelative(string basepath, string path) + { + Contracts.AssertNonEmpty(basepath); + Contracts.AssertNonEmpty(path); + + Uri baseUri = new Uri(basepath); + Uri uri = new Uri(path); + + if (baseUri.Scheme != uri.Scheme) + { + throw new ArgumentException("Paths cannot be made relative as they are of different schemes."); + } + + string relativePath; + try + { + if (!baseUri.AbsoluteUri.EndsWith("/")) + { + baseUri = new Uri(baseUri.AbsoluteUri + "/"); + } + + relativePath = baseUri.MakeRelativeUri(uri).ToString(); + } + catch (ArgumentNullException e) + { + throw new ArgumentException("Paths could not be made relative.", e); + } + catch (InvalidOperationException e) + { + throw new ArgumentException("Paths could not be made relative.", e); + } + + if (uri.Scheme.Equals("file", StringComparison.OrdinalIgnoreCase)) + { + relativePath = relativePath.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); + } + + return relativePath; + } + + /// + /// Split a path string into an enumerable list of the directories. + /// + /// The path string to split. + /// An enumerable list of all non-empty directories. + internal static IEnumerable SplitDirectories(string path) + { + char[] separators = { Path.DirectorySeparatorChar }; + + var cleanPath = path.Replace(Path.AltDirectorySeparatorChar, Path.DirectorySeparatorChar); + return cleanPath.Split(separators, StringSplitOptions.RemoveEmptyEntries); + } + } +} From 674b5cbc36369379c586517610c40c3a3826f85e Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 23 May 2018 16:08:44 -0700 Subject: [PATCH 18/25] Modify exception handling to use Contracts instead. --- src/Microsoft.ML.Data/Utilities/PathUtils.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Data/Utilities/PathUtils.cs b/src/Microsoft.ML.Data/Utilities/PathUtils.cs index 92db9f2eb4..a56d2d4a38 100644 --- a/src/Microsoft.ML.Data/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/PathUtils.cs @@ -16,7 +16,7 @@ internal static class PathUtils /// The base path, assumed to be a directory. /// The full path. /// The relative path. - /// If the paths are not relative. + /// If the paths are not relative. 
internal static string MakePathRelative(string basepath, string path) { Contracts.AssertNonEmpty(basepath); @@ -27,7 +27,7 @@ internal static string MakePathRelative(string basepath, string path) if (baseUri.Scheme != uri.Scheme) { - throw new ArgumentException("Paths cannot be made relative as they are of different schemes."); + throw Contracts.Except("Paths cannot be made relative as they are of different schemes."); } string relativePath; @@ -42,11 +42,11 @@ internal static string MakePathRelative(string basepath, string path) } catch (ArgumentNullException e) { - throw new ArgumentException("Paths could not be made relative.", e); + throw Contracts.Except(e, "Paths could not be made relative."); } catch (InvalidOperationException e) { - throw new ArgumentException("Paths could not be made relative.", e); + throw Contracts.Except(e, "Paths could not be made relative."); } if (uri.Scheme.Equals("file", StringComparison.OrdinalIgnoreCase)) From 5265c9052b09762f669afd25c21b60680c983c25 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 23 May 2018 16:12:05 -0700 Subject: [PATCH 19/25] Add UnescapeDataString to realtive path method. --- src/Microsoft.ML.Data/Utilities/PathUtils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Utilities/PathUtils.cs b/src/Microsoft.ML.Data/Utilities/PathUtils.cs index a56d2d4a38..f868f1d693 100644 --- a/src/Microsoft.ML.Data/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/PathUtils.cs @@ -38,7 +38,7 @@ internal static string MakePathRelative(string basepath, string path) baseUri = new Uri(baseUri.AbsoluteUri + "/"); } - relativePath = baseUri.MakeRelativeUri(uri).ToString(); + relativePath = Uri.UnescapeDataString(baseUri.MakeRelativeUri(uri).ToString()); } catch (ArgumentNullException e) { From a040b510c7635255d9d69cabf5c246b5c00e9f49 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 23 May 2018 16:16:45 -0700 Subject: [PATCH 20/25] Rename PathUtils to prevent name conflicts. --- src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs | 6 +++--- src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs | 5 ++--- .../Utilities/{PathUtils.cs => PartitionedPathUtils.cs} | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) rename src/Microsoft.ML.Data/Utilities/{PathUtils.cs => PartitionedPathUtils.cs} (98%) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index 215fbeadab..a65f8c275f 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -674,7 +674,7 @@ private bool TryTruncatePath(int dirCount, string path, out string truncPath) truncPath = null; // Remove directories that shouldn't be parsed. 
- var segments = PathUtils.SplitDirectories(path); + var segments = PartitionedPathUtils.SplitDirectories(path); segments = segments.Skip(segments.Count() - dirCount - 1); if (segments.Count() < dirCount - 1) @@ -728,7 +728,7 @@ private string GetRelativePath(string basepath, IMultiStreamSource files) string path = files.GetPathOrNull(0); _host.CheckNonEmpty(path, nameof(path)); - var relativePath = PathUtils.MakePathRelative(basepath, path); + var relativePath = PartitionedPathUtils.MakePathRelative(basepath, path); return relativePath; } @@ -749,7 +749,7 @@ private IEnumerable ParseColumns(string path) /// The number of directories private int GetDirectoryCount(string path) { - return PathUtils.SplitDirectories(path).Count() - 1; + return PartitionedPathUtils.SplitDirectories(path).Count() - 1; } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs index 08b17aad55..2c88e9f29b 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs @@ -12,7 +12,6 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.Utilities; using Microsoft.ML.Runtime.EntryPoints; -using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; [assembly: LoadableClass(SimplePartitionedPathParser.Summary, typeof(SimplePartitionedPathParser), typeof(SimplePartitionedPathParser.Arguments), typeof(PartitionedPathParser), @@ -186,7 +185,7 @@ public IEnumerable ParseValues(string path) { Contracts.AssertNonEmpty(path); - var dirs = PathUtils.SplitDirectories(path); + var dirs = PartitionedPathUtils.SplitDirectories(path); return dirs.Take(dirs.Count() - 1); // Ignore last directory which is the file name. } } @@ -332,7 +331,7 @@ public bool TryParseNamesAndValues(string path, out List names, out List return false; } - var dirs = PathUtils.SplitDirectories(path); + var dirs = PartitionedPathUtils.SplitDirectories(path); dirs = dirs.Take(dirs.Count() - 1); // Ignore last directory which is the file name. names = new List(dirs.Count()); diff --git a/src/Microsoft.ML.Data/Utilities/PathUtils.cs b/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs similarity index 98% rename from src/Microsoft.ML.Data/Utilities/PathUtils.cs rename to src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs index f868f1d693..dad03526a7 100644 --- a/src/Microsoft.ML.Data/Utilities/PathUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs @@ -8,7 +8,7 @@ namespace Microsoft.ML.Runtime.Data.Utilities { - internal static class PathUtils + internal static class PartitionedPathUtils { /// /// Make a full path realtive to a base path. From fe6ca03c2ddf3325b8f313531fdce472af6cb6a3 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Wed, 23 May 2018 16:20:17 -0700 Subject: [PATCH 21/25] Address comments. 
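(For context on the helpers touched here: a minimal caller-side sketch of how the renamed PartitionedPathUtils methods are intended to behave. The sample paths and variable names below are illustrative only, and the sketch assumes the internal utilities are visible to the caller.)

    // Caller-side sketch; paths are hypothetical.
    string basePath = @"C:\data\Partitioned\Named";
    string fullPath = @"C:\data\Partitioned\Named\Year=2017\Month=01\data1.csv";

    // Expected to yield "Year=2017\Month=01\data1.csv" on Windows, since file-scheme
    // paths have their separators normalized to Path.DirectorySeparatorChar.
    string relative = PartitionedPathUtils.MakePathRelative(basePath, fullPath);

    // Expected to yield { "Year=2017", "Month=01", "data1.csv" }; the path parsers
    // drop the final element because it is the file name rather than a directory.
    var segments = PartitionedPathUtils.SplitDirectories(relative);

(On non-Windows platforms the separators follow Path.DirectorySeparatorChar for that platform instead.)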
--- src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs b/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs index dad03526a7..7992a70e36 100644 --- a/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs @@ -27,7 +27,7 @@ internal static string MakePathRelative(string basepath, string path) if (baseUri.Scheme != uri.Scheme) { - throw Contracts.Except("Paths cannot be made relative as they are of different schemes."); + throw Contracts.ExceptParam(basepath, "Paths cannot be made relative as they are of different schemes."); } string relativePath; From 097086fafc6c8d6441674868cbdbdf165918b125 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Thu, 24 May 2018 16:03:33 -0700 Subject: [PATCH 22/25] Fix ExceptParam call. --- src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs b/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs index 7992a70e36..b13a0d5cee 100644 --- a/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/PartitionedPathUtils.cs @@ -27,7 +27,7 @@ internal static string MakePathRelative(string basepath, string path) if (baseUri.Scheme != uri.Scheme) { - throw Contracts.ExceptParam(basepath, "Paths cannot be made relative as they are of different schemes."); + throw Contracts.ExceptParam(nameof(basepath), "Paths cannot be made relative as they are of different schemes."); } string relativePath; From c1a589737ff45ef393f1c530ed210a56bc904859 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Thu, 31 May 2018 15:49:03 -0700 Subject: [PATCH 23/25] Move ZBaselines to new test\BaselineOutput location. 
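(A short note before the baseline move: the two preceding patches settle on the Contracts helpers for error reporting. A minimal sketch of the intended call shapes, reusing the variable names and message text from the hunks above; the helper semantics are as used elsewhere in this change.)

    // Argument-specific failure: report the offending parameter by name, not by value.
    if (baseUri.Scheme != uri.Scheme)
        throw Contracts.ExceptParam(nameof(basepath), "Paths cannot be made relative as they are of different schemes.");

    // General failure wrapping the original exception, as in the catch blocks:
    //     throw Contracts.Except(e, "Paths could not be made relative.");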
--- .../SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt | 0 .../SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt | 0 .../SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt | 0 .../SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt | 0 .../SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt | 0 .../SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt | 0 .../SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt | 0 .../SavePipe/PartitionedUnnamedDirectories-Schema.txt | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename {ZBaselines => test/BaselineOutput}/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt (100%) rename {ZBaselines => test/BaselineOutput}/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt (100%) diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt b/test/BaselineOutput/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt similarity index 100% rename from ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt rename to test/BaselineOutput/SingleDebug/SavePipe/PartitionedNamedDirectories-Data.txt diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt b/test/BaselineOutput/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt similarity index 100% rename from ZBaselines/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt rename to test/BaselineOutput/SingleDebug/SavePipe/PartitionedNamedDirectories-Schema.txt diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt b/test/BaselineOutput/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt similarity index 100% rename from ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt rename to test/BaselineOutput/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Data.txt diff --git a/ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt b/test/BaselineOutput/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt similarity index 100% rename from ZBaselines/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt rename to test/BaselineOutput/SingleDebug/SavePipe/PartitionedUnnamedDirectories-Schema.txt diff --git a/ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt b/test/BaselineOutput/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt similarity index 100% rename from ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt rename to test/BaselineOutput/SingleRelease/SavePipe/PartitionedNamedDirectories-Data.txt diff --git a/ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt b/test/BaselineOutput/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt similarity index 100% rename from 
ZBaselines/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt rename to test/BaselineOutput/SingleRelease/SavePipe/PartitionedNamedDirectories-Schema.txt diff --git a/ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt b/test/BaselineOutput/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt similarity index 100% rename from ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt rename to test/BaselineOutput/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Data.txt diff --git a/ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt b/test/BaselineOutput/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt similarity index 100% rename from ZBaselines/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt rename to test/BaselineOutput/SingleRelease/SavePipe/PartitionedUnnamedDirectories-Schema.txt From d9905bb255678d48eb4aabec32ff7e7603029c6a Mon Sep 17 00:00:00 2001 From: tyclintw Date: Fri, 1 Jun 2018 15:01:50 -0700 Subject: [PATCH 24/25] address comments --- src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index a65f8c275f..8a25ead744 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -176,7 +176,7 @@ public PartitionedFileLoader(IHostEnvironment env, Arguments args, IMultiStreamS _host.CheckValue(files, nameof(files)); _pathParser = args.PathParserFactory.CreateComponent(_host); - _host.CheckValue(_pathParser, nameof(_pathParser), "Failed to create the FilePathSpec."); + _host.CheckUserArg(_pathParser != null, nameof(args.PathParserFactory), "Failed to create the FilePathSpec."); _files = files; From ec92ecd205e817235cf9bfee95db289701e6cd46 Mon Sep 17 00:00:00 2001 From: tyclintw Date: Tue, 5 Jun 2018 10:34:07 -0700 Subject: [PATCH 25/25] Modify all Exceptions to use Contracts.Exception. --- .../DataLoadSave/PartitionedFileLoader.cs | 4 ++-- .../DataLoadSave/PartitionedPathParser.cs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs index 8a25ead744..69eb3bbb3b 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedFileLoader.cs @@ -461,7 +461,7 @@ protected override bool MoveNextCore() UpdateSubGetters(); UpdateColumnValues(relativePath, values); } - catch (FormatException e) + catch (InvalidOperationException e) { // Failed to load this file so skip. Ch.Warning(MessageSensitivity.Schema, e.Message); @@ -706,7 +706,7 @@ private bool TryParseValuesFromPath(string path, out List results) results = _parent._pathParser.ParseValues(path).ToList(); return true; } - catch (FormatException e) + catch (InvalidOperationException e) { Ch.Warning($"Could not parse column values from the path {path}. 
Ex: {e.Message}"); results = null; diff --git a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs index 2c88e9f29b..ca3aa075ab 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/PartitionedPathParser.cs @@ -45,7 +45,7 @@ public interface IPartitionedPathParser /// /// The file path. /// The resulting column definitions. - /// Thrown when parsing fails. + /// Thrown when parsing fails. IEnumerable ParseColumns(string path); /// @@ -53,7 +53,7 @@ public interface IPartitionedPathParser /// /// The file path. /// The resulting column values. - /// Thrown when parsing fails. + /// Thrown when parsing fails. IEnumerable ParseValues(string path); } @@ -174,7 +174,7 @@ public void Save(ModelSaveContext ctx) { if (col.Source < 0 || col.Source >= values.Count()) { - throw new FormatException($"Column definition {col} is outside the bounds of path {path}."); + throw Contracts.Except($"Column definition {col} is outside the bounds of path {path}."); } } @@ -279,7 +279,7 @@ public void Save(ModelSaveContext ctx) { if (!TryParseNames(path, out List names)) { - throw new FormatException($"Failed to parse names from path {path}. Expected directory names with the format 'Name=Value'."); + throw Contracts.Except($"Failed to parse names from path {path}. Expected directory names with the format 'Name=Value'."); } _columns = new PartitionedFileLoader.Column[names.Count]; @@ -300,12 +300,12 @@ public IEnumerable ParseValues(string path) { if (!TryParseValues(path, out List values)) { - throw new FormatException($"Failed to parse names from path {path}. Expected directory names with the format 'Name=Value'."); + throw Contracts.Except($"Failed to parse names from path {path}. Expected directory names with the format 'Name=Value'."); } if (values.Count != _columns.Length) { - throw new FormatException($"The extracted value count of {values.Count} does not match the expected Column count of {_columns.Length} for path {path}"); + throw Contracts.Except($"The extracted value count of {values.Count} does not match the expected Column count of {_columns.Length} for path {path}"); } return values;