From 7449cfd999ecbf377c568075e28ddc11bfd677a7 Mon Sep 17 00:00:00 2001 From: jimichan Date: Wed, 2 Dec 2020 10:44:01 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=87=E5=88=86=E5=AD=90=E8=AF=8D=E9=A2=84?= =?UTF-8?q?=E7=95=99=E4=B8=80=E4=B8=AA=E5=8F=AF=E4=BB=A5=E6=B3=A8=E5=85=A5?= =?UTF-8?q?=E7=9A=84=E5=88=A4=E6=96=AD=E5=85=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.gradle | 2 +- .../mayabot/nlp/segment/FluentLexerBuilder.kt | 2 +- .../plugins/collector/SmartPickUpSubword.java | 28 +++++++++++++++++-- .../plugins/pos/PosPerceptronProcessor.java | 10 +++++-- .../test/java/com/mayabot/nlp/segment/Test.kt | 27 +++++++++++++++--- 5 files changed, 58 insertions(+), 11 deletions(-) diff --git a/build.gradle b/build.gradle index 219d1936..07463fd5 100644 --- a/build.gradle +++ b/build.gradle @@ -7,7 +7,7 @@ apply plugin: 'eclipse' description = "mynlp是mayabot开源的中文自然语言处理工具集" -def buildVersion = "3.3.0-BETA3" +def buildVersion = "3.3.0-BETA6" def snapShot = false allprojects { diff --git a/mynlp/src/main/java/com/mayabot/nlp/segment/FluentLexerBuilder.kt b/mynlp/src/main/java/com/mayabot/nlp/segment/FluentLexerBuilder.kt index e056f77e..652c3ded 100644 --- a/mynlp/src/main/java/com/mayabot/nlp/segment/FluentLexerBuilder.kt +++ b/mynlp/src/main/java/com/mayabot/nlp/segment/FluentLexerBuilder.kt @@ -133,7 +133,7 @@ open class FluentLexerBuilder(val mynlp: Mynlp = Mynlp.instance()) : LexerBuilde } @JvmOverloads - fun smartPickup(block: (x: WordTermCollector.PickUpSubword) -> Unit + fun smartPickup(block: (x: SmartPickUpSubword) -> Unit = { _ -> Unit } ): CollectorBlock { val p = SmartPickUpSubword(mynlp) diff --git a/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SmartPickUpSubword.java b/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SmartPickUpSubword.java index d86dbdaa..ac710000 100644 --- a/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SmartPickUpSubword.java +++ b/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SmartPickUpSubword.java @@ -13,6 +13,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.function.Function; /** * Nlp收集方式,不处理子词 @@ -29,6 +30,11 @@ public class SmartPickUpSubword implements WordTermCollector.PickUpSubword { private CoreDictionary coreDictionary; private BiGramTableDictionary biGramTableDictionary; + /** + * 外部程序控制是否进一步拆分.返回true表示不再拆分 + */ + private Function blackListCallback; + public SmartPickUpSubword(@NotNull Mynlp mynlp) { // this.mynlp = mynlp; algorithm = mynlp.getInstance(AtomWordViterbiBestPathAlgorithm.class); @@ -36,6 +42,7 @@ public SmartPickUpSubword(@NotNull Mynlp mynlp) { biGramTableDictionary = mynlp.getInstance(BiGramTableDictionary.class); } + /** * 拆分结果保存到term中去 * @@ -46,7 +53,7 @@ public SmartPickUpSubword(@NotNull Mynlp mynlp) { @Override public void pickup(WordTerm term, Wordnet wordnet, Wordpath wordPath) { - //三个字的不拆 + //2个字的不拆 //3.3.0版本开始变成2个字不拆。但是三字是否切分,需要看是否存在bigram搭配(要求严格点) if (term.length() <= 2) { return; @@ -70,13 +77,19 @@ public void pickup(WordTerm term, Wordnet wordnet, Wordpath wordPath) { WordTerm x = new WordTerm(v.realWord(), v.nature, v.getRowNum()); len += v.length; subList.add(x); - System.out.println(v.wordID); } // [省 政府]/n // 如果是3字词,切分为两片。那么要求在biGramTableDictionary中包含一个pair if (len == term.length()) { + + if (blackListCallback != null) { + if (blackListCallback.apply(term.word)) { + return; + } + } + if (len == 3 && subList.size() == 2) { if (this.biGramTableDictionary.getBiFrequency(list.get(0).wordID, list.get(1).wordID) > 0) { term.setSubword(subList); @@ -90,5 +103,14 @@ public void pickup(WordTerm term, Wordnet wordnet, Wordpath wordPath) { } } - + /** + * 外部程序控制是否进一步拆分.返回true表示不再拆分 + * + * @param blackListCallback + * @return + */ + public SmartPickUpSubword setBlackListCallback(Function blackListCallback) { + this.blackListCallback = blackListCallback; + return this; + } } diff --git a/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPerceptronProcessor.java b/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPerceptronProcessor.java index 25473151..f83bd85a 100644 --- a/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPerceptronProcessor.java +++ b/mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPerceptronProcessor.java @@ -51,13 +51,19 @@ public Wordpath process(Wordpath wordPath) { for (int i = 0; i < vertices.size(); i++) { Vertex vertex = vertices.get(i); - Nature nr = posList.get(i); + Nature nature = posList.get(i); + // 一个普通的词汇(不是由人名识别构造而成的),被判断为人名,而且长度大于3 + // case is 基础设施/nr + if (Nature.nr.equals(nature) && vertex.nature == null && vertex.length > 3) { + vertex.nature = Nature.n; + continue; + } //人名识别,的优先级不能高于词性分析器。 if (vertex.nature == null || vertex.nature == Nature.newWord || vertex.nature == Nature.nr) { - vertex.nature = nr; + vertex.nature = nature; } } diff --git a/mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt b/mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt index 18e8c009..53c80e62 100644 --- a/mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt +++ b/mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt @@ -7,16 +7,35 @@ fun main() { val mynlp = Mynlp.instance() val mem = MemCorrectionDictionary() - mem.addRule("安徽省/政府") - mem.rebuild() +// mem.addRule("安徽省/政府") +// mem.rebuild() val lexer = mynlp.lexerBuilder() .bigram() .withPos() .withPersonName() - .collector().smartPickup().done() + .collector().smartPickup { + it.setBlackListCallback { + it[0] == '副' && it[it.length - 1] == '长' + } + } + .done() // .withCorrection(mem) .build() - println(lexer.scan("安徽省政府网站居住证办理身份证办理")) + lexer.scan("副市长 副省长").forEach { + print(it) + println("\t has sub " + it.hasSubword()) + } + + + //default core +// val lexer2 = Lexers.coreBuilder() +// .withPersonName() +//// .withPos() +// .collector().smartPickup() +// .done() +// .build() +// +// println(lexer2.scan("基础设施")) } \ No newline at end of file