From 5333e8746a667c0ede292a5917f7ca767aaa9bae Mon Sep 17 00:00:00 2001 From: jimichan Date: Thu, 8 Jul 2021 00:22:18 +0800 Subject: [PATCH] 4.0.0 --- README.adoc | 2 +- build.gradle.kts | 5 +- .../nlp/module/pinyin/PinyinResult.java | 54 +++++++++++++++---- .../collector/CustomDictFillSubword.java | 3 ++ .../java/com/mayabot/nlp/pinyin/PinyinTest.kt | 9 ++++ .../com/mayabot/nlp/segment/CustomDictTest.kt | 10 ++-- 6 files changed, 64 insertions(+), 19 deletions(-) diff --git a/README.adoc b/README.adoc index ea39439b..0148e84f 100644 --- a/README.adoc +++ b/README.adoc @@ -1,5 +1,5 @@ = Mynlp: 高性能、可扩展的中文NLP工具包 -:version: 3.3.0 +:version: 4.0.0 :icons: font image:https://img.shields.io/github/license/mayabot/mynlp.svg[] diff --git a/build.gradle.kts b/build.gradle.kts index a050d177..1066723e 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -8,7 +8,7 @@ plugins { description = "mynlp是mayabot开源的中文自然语言处理工具集" -val buildVersion = "4.0.0-beta16" +val buildVersion = "4.0.0" //val buildVersion = "4.0.0-local" val snapShot = false @@ -154,9 +154,8 @@ subprojects { quiet() charset("UTF-8") } - - } + tasks.withType{ options.encoding = "UTF-8" options.compilerArgs = options.compilerArgs + listOf("-Xdoclint:none", "-Xlint:none", "-nowarn") diff --git a/mynlp/src/main/java/com/mayabot/nlp/module/pinyin/PinyinResult.java b/mynlp/src/main/java/com/mayabot/nlp/module/pinyin/PinyinResult.java index d351ed37..3f0fd613 100644 --- a/mynlp/src/main/java/com/mayabot/nlp/module/pinyin/PinyinResult.java +++ b/mynlp/src/main/java/com/mayabot/nlp/module/pinyin/PinyinResult.java @@ -23,7 +23,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -98,6 +97,39 @@ public String asString() { private static Map fuzzyMap = fmap(); + private static Map fuzzyPinyinMap = fzMap(); + + public static Map fzMap() { + Map map = new HashMap<>(); + + for (Pinyin value : Pinyin.values()) { + String sm = value.getShengmu().toString(); + String ym = value.getYunmu().toString(); + + if (sm.equals("zh")) { + sm = "z"; + } else if (sm.equals("ch")) { + sm = "c"; + } else if (sm.equals("sh")) { + sm = "s"; + } + + if (ym.equals("eng")) { + ym = "en"; + } else if (ym.equals("ang")) { + ym = "an"; + } else if (ym.equals("ing")) { + ym = "in"; + } else if (ym.equals("iang")) { + ym = "ian"; + } else if (ym.equals("uang")) { + ym = "uan"; + } + map.put(value.getPinyinWithoutTone(), sm + ym); + } + return map; + } + private static Map fmap() { HashMap map = new HashMap<>(); map.put("zh", "z"); @@ -112,6 +144,7 @@ private static Map fmap() { } + public List asList() { List list = Lists.newArrayListWithCapacity(pinyinList.size()); int i = 0; @@ -138,16 +171,17 @@ public List asList() { } else { String withoutTone = pinyin.getPinyinWithoutTone(); - if (fuzzy) { - Matcher matcher = pattern.matcher(withoutTone); - StringBuffer sb = new StringBuffer(); - if (matcher.find()) { - String part = matcher.group(); - matcher.appendReplacement(sb, fuzzyMap.get(part)); - } - matcher.appendTail(sb); - list.add(sb.toString()); + + list.add(fuzzyPinyinMap.getOrDefault(withoutTone, withoutTone)); +// Matcher matcher = pattern.matcher(withoutTone); +// StringBuffer sb = new StringBuffer(); +// if (matcher.find()) { +// String part = matcher.group(); +// matcher.appendReplacement(sb, fuzzyMap.get(part)); +// } +// matcher.appendTail(sb); +// list.add(sb.toString()); } else { list.add(withoutTone); } diff --git a/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CustomDictFillSubword.java b/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CustomDictFillSubword.java index c6df63af..f443ed03 100644 --- a/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CustomDictFillSubword.java +++ b/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CustomDictFillSubword.java @@ -22,6 +22,9 @@ public CustomDictFillSubword(CustomDictionary dictionary) { @Override public void fill(@NotNull Wordnet wordnet, @NotNull Wordpath wordPath) { DoubleArrayTrieStringIntMap trie = dictionary.getTrie(); + if (trie == null) { + return; + } char[] text = wordnet.getCharArray(); DoubleArrayTrieStringIntMap.DATMapMatcherInt searcher = trie.match(text, 0); diff --git a/mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinTest.kt b/mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinTest.kt index 70ee73b0..a0747b2e 100644 --- a/mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinTest.kt +++ b/mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinTest.kt @@ -1,6 +1,7 @@ package com.mayabot.nlp.pinyin import com.mayabot.nlp.Mynlp +import com.mayabot.nlp.Mynlp.Companion.instance import org.junit.Assert import org.junit.Test @@ -11,5 +12,13 @@ class PinyinTest { Assert.assertEquals("[zhao, zhao, mu, mu]", "朝朝暮暮".py()) } + @Test + fun test2() { + println( + instance().convertPinyin("转战") + .fuzzy(true).asList() + ) + } + private fun String.py() = Mynlp.instance().convertPinyin(this).asList().toString() } \ No newline at end of file diff --git a/mynlp/src/test/java/com/mayabot/nlp/segment/CustomDictTest.kt b/mynlp/src/test/java/com/mayabot/nlp/segment/CustomDictTest.kt index 16cf106c..b6d10ab7 100644 --- a/mynlp/src/test/java/com/mayabot/nlp/segment/CustomDictTest.kt +++ b/mynlp/src/test/java/com/mayabot/nlp/segment/CustomDictTest.kt @@ -14,6 +14,11 @@ class CustomDictTest { mem.addWord("固收"); mem.rebuild() + mem.clear() + + mem.addWord("固收"); + mem.rebuild() + val lexer = Lexers.coreBuilder() .withCustomDictionary(mem) .collector() @@ -21,11 +26,6 @@ class CustomDictTest { .fillSubwordCustomDict(mem) .done() .build() - println("-----") - for (wordTerm in lexer.scan("长江1号")) { - println(wordTerm) - } - println("-----") println(lexer.scan("ECS固收")) println("----")