Skip to content

Commit

Permalink
切分子词预留一个可以注入的判断入口
Browse files Browse the repository at this point in the history
  • Loading branch information
jimichan committed Dec 2, 2020
1 parent 42f5a10 commit 7449cfd
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 11 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ apply plugin: 'eclipse'

description = "mynlp是mayabot开源的中文自然语言处理工具集"

def buildVersion = "3.3.0-BETA3"
def buildVersion = "3.3.0-BETA6"
def snapShot = false

allprojects {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ open class FluentLexerBuilder(val mynlp: Mynlp = Mynlp.instance()) : LexerBuilde
}

@JvmOverloads
fun smartPickup(block: (x: WordTermCollector.PickUpSubword) -> Unit
fun smartPickup(block: (x: SmartPickUpSubword) -> Unit
= { _ -> Unit }
): CollectorBlock {
val p = SmartPickUpSubword(mynlp)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

/**
* Nlp收集方式,不处理子词
Expand All @@ -29,13 +30,19 @@ public class SmartPickUpSubword implements WordTermCollector.PickUpSubword {
private CoreDictionary coreDictionary;
private BiGramTableDictionary biGramTableDictionary;

/**
 * Hook supplied by external code to control whether a term is split further.
 * Returning true means the term is not split any further.
 */
private Function<String, Boolean> blackListCallback;

/**
 * Looks up the best-path algorithm, the core dictionary and the bigram table
 * dictionary from the given {@link Mynlp} instance and stores them in fields.
 * The {@code mynlp} reference itself is not stored by this constructor.
 *
 * @param mynlp the instance the three dependencies are obtained from
 */
public SmartPickUpSubword(@NotNull Mynlp mynlp) {
    this.algorithm = mynlp.getInstance(AtomWordViterbiBestPathAlgorithm.class);
    this.coreDictionary = mynlp.getInstance(CoreDictionary.class);
    this.biGramTableDictionary = mynlp.getInstance(BiGramTableDictionary.class);
}


/**
* 拆分结果保存到term中去
*
Expand All @@ -46,7 +53,7 @@ public SmartPickUpSubword(@NotNull Mynlp mynlp) {
@Override
public void pickup(WordTerm term, Wordnet wordnet, Wordpath wordPath) {

//三个字的不拆
//2个字的不拆
//3.3.0版本开始变成2个字不拆。但是三字是否切分,需要看是否存在bigram搭配(要求严格点)
if (term.length() <= 2) {
return;
Expand All @@ -70,13 +77,19 @@ public void pickup(WordTerm term, Wordnet wordnet, Wordpath wordPath) {
WordTerm x = new WordTerm(v.realWord(), v.nature, v.getRowNum());
len += v.length;
subList.add(x);
System.out.println(v.wordID);
}

// [省 政府]/n
// 如果是3字词,切分为两片。那么要求在biGramTableDictionary中包含一个pair

if (len == term.length()) {

if (blackListCallback != null) {
if (blackListCallback.apply(term.word)) {
return;
}
}

if (len == 3 && subList.size() == 2) {
if (this.biGramTableDictionary.getBiFrequency(list.get(0).wordID, list.get(1).wordID) > 0) {
term.setSubword(subList);
Expand All @@ -90,5 +103,14 @@ public void pickup(WordTerm term, Wordnet wordnet, Wordpath wordPath) {
}
}


/**
 * Installs the hook that lets external code control whether a term is split further.
 * During pickup the callback is applied to the term's word; when it returns true
 * the term is not split. A null callback is skipped by the null check in pickup.
 *
 * @param blackListCallback receives the word of the term; returns true to prevent splitting
 * @return this instance, so that calls can be chained
 */
public SmartPickUpSubword setBlackListCallback(Function<String, Boolean> blackListCallback) {
this.blackListCallback = blackListCallback;
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,19 @@ public Wordpath process(Wordpath wordPath) {

for (int i = 0; i < vertices.size(); i++) {
Vertex vertex = vertices.get(i);
Nature nr = posList.get(i);

Nature nature = posList.get(i);
// 一个普通的词汇(不是由人名识别构造而成的),被判断为人名,而且长度大于3
// case is 基础设施/nr
if (Nature.nr.equals(nature) && vertex.nature == null && vertex.length > 3) {
vertex.nature = Nature.n;
continue;
}
//人名识别,的优先级不能高于词性分析器。
if (vertex.nature == null
|| vertex.nature == Nature.newWord ||
vertex.nature == Nature.nr) {
vertex.nature = nr;
vertex.nature = nature;
}
}

Expand Down
27 changes: 23 additions & 4 deletions mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,35 @@ fun main() {
val mynlp = Mynlp.instance()
val mem = MemCorrectionDictionary()

mem.addRule("安徽省/政府")
mem.rebuild()
// mem.addRule("安徽省/政府")
// mem.rebuild()

val lexer = mynlp.lexerBuilder()
.bigram()
.withPos()
.withPersonName()
.collector().smartPickup().done()
.collector().smartPickup {
it.setBlackListCallback {
it[0] == '副' && it[it.length - 1] == '长'
}
}
.done()
// .withCorrection(mem)
.build()

println(lexer.scan("安徽省政府网站居住证办理身份证办理"))
lexer.scan("副市长 副省长").forEach {
print(it)
println("\t has sub " + it.hasSubword())
}


//default core
// val lexer2 = Lexers.coreBuilder()
// .withPersonName()
//// .withPos()
// .collector().smartPickup()
// .done()
// .build()
//
// println(lexer2.scan("基础设施"))
}

0 comments on commit 7449cfd

Please sign in to comment.