From 78d72d6ab33830d41bc1bc1daca3903d23c9a4a4 Mon Sep 17 00:00:00 2001 From: "helgeu@urholm.no" Date: Mon, 4 Jun 2018 18:44:46 +0200 Subject: [PATCH 1/3] Went through: https://raw.githubusercontent.com/ecrmnn/norwegian-stop-words/master/dist/stop-words.txt https://raw.githubusercontent.com/crodas/TextRank/master/lib/TextRank/Stopword/norwegian-stopwords.txt https://raw.githubusercontent.com/Alir3z4/stop-words/master/norwegian.txt https://raw.githubusercontent.com/helgeu/machinelearning/master/src/Microsoft.ML.Transforms/Text/StopWords/Norwegian_Bokmal.txt http://snowball.tartarus.org/algorithms/norwegian/stop.txt to collect and correct the Norwegian stopwords. --- .../Text/StopWords/Norwegian_Bokmal.txt | Bin 1392 -> 2000 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWords/Norwegian_Bokmal.txt b/src/Microsoft.ML.Transforms/Text/StopWords/Norwegian_Bokmal.txt index 21ab890d7850dcdf127d5b8f5e9815cdcf010462..00fc30a26f1f1f1d6cf982efea1970ad82dbf202 100644 GIT binary patch literal 2000 zcmZ9NTdvzM3`F%^po_dgUQS{=uoDA!!R{LJ)t1M@k=(*SG^EJkLzMF8_g8tucPc;2 zkMdQ9ax7Dorg8|}g-*M#T$gP)?}Df*%_kbW+o+ShZ~so&?a zrthcvg2)lhOSx^V^u#jzmd2ctOY&Tkm-V=v3|>g8OGq6DE8lDI4e|XiD~fkdk(Joo zoon>nyi0g%mwsv<_PqsnTd&O8U0+!_Utk>+zfdj?a{w_ zdEc^vTX<=*?=|8*GEhzTTCk+oTiD05r3HML)A*Aa2u(;QE{w-a->pUK1>a)r}& zMz8o)m*?>Ax9$=FX~+)n`q+lv>rI}sLRQ@Z5O-EJBJGj7fYzr0b~ba|`A={wHZ$1p zJFx#Z=0Kh;@O#JLKF!-~aPDG5uTv#*>k$&SvvIHKu}0>~9SfbQY+)4QKgst_8nN@g zh83@bUVBN(fPCE{UJsqRkSvUqJ@1P6^kVycmDFrFcT1W)y)?uYa+OHU)^oR%Uy+L$ Pz|N+|AA|Q9)B4FF9R1t5<@COIuNE#tk+~tVMv^KS9LNUqtIkMMv2Kmj0uxZG47kZ zfk|re6(*g@0?ZMU%a}!&vl$X6?`IC3EXSfZIfg}tF>mr_7GuWD$?U9Kf#h9Q@yT3l hT$3%>OeVLnsZ8F+mND6aU1D+;yV>L&>=G;>TLH=ZB{Tp4 From 109c3e6d1f1615bbbb761b2a1dd698df6665e1c0 Mon Sep 17 00:00:00 2001 From: "helgeu@urholm.no" Date: Mon, 4 Jun 2018 19:05:46 +0200 Subject: [PATCH 2/3] 2 more words nynorsk --- .../Text/StopWords/Norwegian_Bokmal.txt | Bin 2000 -> 1974 bytes 1 file changed, 0 
insertions(+), 0 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWords/Norwegian_Bokmal.txt b/src/Microsoft.ML.Transforms/Text/StopWords/Norwegian_Bokmal.txt index 00fc30a26f1f1f1d6cf982efea1970ad82dbf202..57c93978abd805f0b6abccb98b1cf207c05da63c 100644 GIT binary patch delta 11 Scmcb>zm0!G0Q=-D_A~$+Km-;5 delta 31 lcmdnSe}R8P0K0H1LlHwN11|#?Lm7}RVkluqo$Se;3jlz_2Z#Uw From fb1f0de199a69f088737879f084f79b4f82f0bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Helge=20Ren=C3=A9=20Urholm?= Date: Thu, 14 Jun 2018 08:10:44 +0200 Subject: [PATCH 3/3] Reverted to using two files for stopwords, and made changes according to @TomFinley --- .../Microsoft.ML.Transforms.csproj | 1 + .../Text/StopWordsRemoverTransform.cs | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj b/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj index 8aa272922c..4aebfb6b91 100644 --- a/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj +++ b/src/Microsoft.ML.Transforms/Microsoft.ML.Transforms.csproj @@ -4,6 +4,7 @@ netstandard2.0 Microsoft.ML CORECLR + ..\..\bin\obj\AnyCPU.Debug\Microsoft.ML.Transforms diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs index 96bf44b4c2..4ce69cf7db 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemoverTransform.cs @@ -90,7 +90,10 @@ public enum Language Polish = 12, Czech = 13, Arabic = 14, - Japanese = 15 + Japanese = 15, + + [HideEnumValue] + Norwegian_Bokmal_v1 = 256 } public sealed class Column : OneToOneColumn @@ -198,6 +201,11 @@ public ColInfoEx(ModelLoadContext ctx, ISchema input) // int: the id of languages column name Lang = (Language)ctx.Reader.ReadInt32(); Contracts.CheckDecode(Enum.IsDefined(typeof(Language), Lang)); + if(Lang == Language.Norwegian_Bokmal + && 
ctx.Header.ModelVerWritten == 0x00010001) + { + Lang = Language.Norwegian_Bokmal_v1; + } _langsColName = ctx.LoadStringOrNull(); if (_langsColName != null) { @@ -229,8 +237,8 @@ private static VersionInfo GetVersionInfo() { return new VersionInfo( modelSignature: "STOPWRDR", - verWrittenCur: 0x00010001, // Initial - verReadableCur: 0x00010001, + verWrittenCur: 0x00010002, // Updated Norwegian_Bokmal stop words; 0x00010001 models load as Norwegian_Bokmal_v1 + verReadableCur: 0x00010002, verWeCanReadBack: 0x00010001, loaderSignature: LoaderSignature); }