-
Notifications
You must be signed in to change notification settings - Fork 354
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #265 from zh-lx/feature-segment-split
feat: add segment api
- Loading branch information
Showing 14 changed files with 819 additions and 10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
import { BasicOptions } from "../pinyin"; | ||
import { TokenizationAlgorithm } from "../../common/segmentit"; | ||
import { stringLength } from "@/common/utils"; | ||
import { middleWareNonZh, middlewareToneSandhi, middlewareToneType, middlewareV, validateType } from "@/core/pinyin/middlewares"; | ||
import { getPinyin } from "@/core/pinyin/handle"; | ||
import { SurnameMode } from "../../common/type"; | ||
import { middlewareOutputFormat, middlewareSegment, Output, OutputFormat } from "./middlewares"; | ||
|
||
/**
 * Subset of the pinyin `BasicOptions` that is shared by every option
 * interface accepted by `segment`.
 */
type SegmentBaseOptions = Pick<
  BasicOptions,
  "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit"
>;
|
||
/** Options selecting the `AllSegment` output format. */
interface AllSegmentReturnOptions extends SegmentBaseOptions {
  /** Return all information in segment format. */
  format?: OutputFormat.AllSegment;
}
|
||
/** Options selecting the `AllArray` output format. */
interface AllArrayReturnOptions extends SegmentBaseOptions {
  /** Return all information in array format. */
  format?: OutputFormat.AllArray;
}
|
||
/** Options selecting the `AllString` output format. */
interface AllStringReturnOptions extends SegmentBaseOptions {
  /** Return all information in string format. */
  format?: OutputFormat.AllString;
  /**
   * Separator, defaults to a space. Only takes effect when `format` is
   * AllString (3), PinyinString (6) or ZhString (9).
   */
  separator?: string;
}
|
||
/** Options selecting the `PinyinSegment` output format. */
interface PinyinSegmentReturnOptions extends SegmentBaseOptions {
  /** Return the pinyin in segment format. */
  format?: OutputFormat.PinyinSegment;
}
|
||
/** Options selecting the `PinyinArray` output format. */
interface PinyinArrayReturnOptions extends SegmentBaseOptions {
  /** Return the pinyin in array format. */
  format?: OutputFormat.PinyinArray;
}
|
||
/** Options selecting the `PinyinString` output format. */
interface PinyinStringReturnOptions extends SegmentBaseOptions {
  /** Return the pinyin in string format. */
  format?: OutputFormat.PinyinString;
  /**
   * Separator, defaults to a space. Only takes effect when `format` is
   * AllString (3), PinyinString (6) or ZhString (9).
   */
  separator?: string;
}
|
||
/** Options selecting the `ZhSegment` output format. */
interface ZhSegmentReturnOptions extends SegmentBaseOptions {
  /** Return the Chinese text in segment format. */
  format?: OutputFormat.ZhSegment;
}
|
||
/** Options selecting the `ZhArray` output format. */
interface ZhArrayReturnOptions extends SegmentBaseOptions {
  /** Return the Chinese text in array format. */
  format?: OutputFormat.ZhArray;
}
|
||
/** Options selecting the `ZhString` output format. */
interface ZhStringReturnOptions extends SegmentBaseOptions {
  /** Return the Chinese text in string format. */
  format?: OutputFormat.ZhString;
  /**
   * Separator, defaults to a space. Only takes effect when `format` is
   * AllString (3), PinyinString (6) or ZhString (9).
   */
  separator?: string;
}
|
||
/**
 * Full option set for `segment`: the shared base options plus an output
 * format of any kind and an optional separator.
 */
export interface SegmentCompleteOptions extends SegmentBaseOptions {
  /** Output format to produce; any member of `OutputFormat`. */
  format?: OutputFormat;
  /**
   * Separator, defaults to a space. Only takes effect when `format` is
   * AllString (3), PinyinString (6) or ZhString (9).
   */
  separator?: string;
}
|
||
/**
 * Baseline option values for `segment`. Caller-supplied options are spread
 * on top of these, so any key the caller provides replaces the value here.
 */
const DEFAULT_OPTIONS: SegmentCompleteOptions = {
  toneType: "symbol",
  mode: "normal",
  nonZh: "spaced",
  v: false,
  separator: " ",
  toneSandhi: true,
  segmentit: TokenizationAlgorithm.MaxProbability,
  format: OutputFormat.AllSegment,
};
|
||
/**
 * Converts `word` to pinyin and groups the result into segments, returned
 * in the shape selected by `options.format` (defaults to `AllSegment`).
 *
 * The overloads narrow the return type from the literal `format` value.
 * The last overload accepts the general `SegmentCompleteOptions` (for
 * example a `format` held in a variable typed as `OutputFormat`) and
 * returns the union of all output shapes; without it such calls match no
 * overload, because the implementation signature is not callable from
 * outside.
 *
 * @param word - Text to convert. If `validateType` rejects it, `word` is
 *   returned unchanged.
 * @param options - Conversion and output options; missing keys fall back
 *   to `DEFAULT_OPTIONS`.
 * @returns The formatted segments as produced by `middlewareOutputFormat`.
 */
export function segment(word: string, options?: AllSegmentReturnOptions): Output['AllSegment'];
export function segment(word: string, options?: AllArrayReturnOptions): Output['AllArray'];
export function segment(word: string, options?: AllStringReturnOptions): Output['AllString'];
export function segment(word: string, options?: PinyinSegmentReturnOptions): Output['PinyinSegment'];
export function segment(word: string, options?: PinyinArrayReturnOptions): Output['PinyinArray'];
export function segment(word: string, options?: PinyinStringReturnOptions): Output['PinyinString'];
export function segment(word: string, options?: ZhSegmentReturnOptions): Output['ZhSegment'];
export function segment(word: string, options?: ZhArrayReturnOptions): Output['ZhArray'];
export function segment(word: string, options?: ZhStringReturnOptions): Output['ZhString'];
export function segment(word: string, options?: SegmentCompleteOptions): Output[keyof Output];

export function segment(word: string, options?: SegmentCompleteOptions) {
  // Work on a fresh object so the caller's options are never mutated.
  const merged: SegmentCompleteOptions = { ...DEFAULT_OPTIONS, ...(options || {}) };

  // Check that word has a valid type; hand it back untouched otherwise.
  if (!validateType(word)) {
    return word;
  }

  // When surname is not given explicitly, derive it from mode.
  if (merged.surname === undefined) {
    merged.surname = merged.mode === "surname" ? "all" : "off";
  }

  const slots = Array(stringLength(word));

  const { list: rawList, matches } = getPinyin(
    word,
    slots,
    merged.surname as SurnameMode,
    merged.segmentit as TokenizationAlgorithm
  );

  // Tone sandhi handling for 一 and 不.
  let list = middlewareToneSandhi(rawList, merged.toneSandhi as boolean);

  // nonZh option handling.
  list = middleWareNonZh(list, merged);

  // toneType option handling (return value is not used).
  middlewareToneType(list, merged);

  // v option handling (return value is not used).
  middlewareV(list, merged);

  const segments = middlewareSegment(list, matches);

  return middlewareOutputFormat(segments, { format: merged.format, separator: merged.separator });
}
|
||
export { OutputFormat }; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import { splitString } from "@/common/utils"; | ||
import { MatchPattern } from "../../common/segmentit"; | ||
import { SingleWordResult } from "../../common/type"; | ||
|
||
/**
 * One segment produced by `middlewareSegment`: the characters that belong
 * together plus a flag telling whether the segment came from a matched word.
 */
interface OriginSegment {
  /** Per-character pairs of original text and converted result. */
  segment: {
    origin: string;
    result: string;
  }[];
  /** `true` when the segment was built from a matched word. */
  isZh: boolean;
}
|
||
/**
 * Output shapes supported by `middlewareOutputFormat`. The numeric values
 * are part of the public API and must stay stable.
 */
export enum OutputFormat {
  /** Original text and result, one object per segment. */
  AllSegment = 1,
  /** Original text and result, one array of per-character objects per segment. */
  AllArray = 2,
  /** Original text and result, each joined into a single string. */
  AllString = 3,
  /** Result only, one string per segment. */
  PinyinSegment = 4,
  /** Result only, one array of per-character strings per segment. */
  PinyinArray = 5,
  /** Result only, joined into a single string. */
  PinyinString = 6,
  /** Original text only, one string per segment. */
  ZhSegment = 7,
  /** Original text only, one array of per-character strings per segment. */
  ZhArray = 8,
  /** Original text only, joined into a single string. */
  ZhString = 9,
}
|
||
/**
 * Maps each output format name to the type of value returned for it.
 */
export interface Output {
  AllSegment: { origin: string; result: string }[];
  AllArray: { origin: string; result: string }[][];
  AllString: { origin: string; result: string };
  PinyinSegment: string[];
  PinyinArray: string[][];
  PinyinString: string;
  ZhSegment: string[];
  ZhArray: string[][];
  ZhString: string;
}
|
||
|
||
/**
 * Groups per-character results into segments that follow the matched words.
 *
 * `list` and `matches` are walked in parallel. When the current item's
 * `origin` is a prefix of the current match's `zh` text, that item and the
 * following items whose `origin` equals the corresponding character of the
 * match form one segment flagged `isZh: true`, and the match is consumed.
 * Any other item becomes a single-item segment flagged `isZh: false`.
 *
 * Every item of `list` ends up in exactly one segment. Items that come
 * after the last match (or all items when there are no matches) are
 * emitted as `isZh: false` segments instead of being dropped.
 *
 * @param list - Per-character results; only `origin` and `result` are read.
 * @param matches - Matched words; only `zh` is read. Presumably ordered by
 *   position in the text — confirm against the producer.
 * @returns The segments in input order.
 */
export function middlewareSegment(list: SingleWordResult[], matches: MatchPattern[]): OriginSegment[] {
  const segments: OriginSegment[] = [];

  let i = 0;
  let j = 0;
  // Loop over the whole list: running out of matches must not end the walk,
  // otherwise trailing unmatched items would be lost.
  while (i < list.length) {
    const item = list[i];
    // undefined once every match has been consumed
    const match = matches[j];

    if (match !== undefined && match.zh.startsWith(item.origin)) {
      const start = i;
      const chars = splitString(match.zh);
      // Extend the run while consecutive items spell out the matched word.
      let cur = start + 1;
      while (cur < list.length && list[cur].origin === chars[cur - start]) {
        cur++;
      }
      segments.push({
        segment: list.slice(start, cur).map((part) => ({
          origin: part.origin,
          result: part.result,
        })),
        isZh: true,
      });
      i = cur;
      j++;
    } else {
      segments.push({
        segment: [
          {
            origin: item.origin,
            result: item.result,
          },
        ],
        isZh: false,
      });
      i++;
    }
  }

  return segments;
}
|
||
|
||
/**
 * Converts segments into the shape selected by `options.format`.
 *
 * @param segments - Segments to format.
 * @param options - `format` defaults to `OutputFormat.AllSegment`;
 *   `separator` defaults to a single space and is only used by the three
 *   string formats (AllString, PinyinString, ZhString).
 * @returns The formatted value, or `undefined` when `format` is not one of
 *   the known `OutputFormat` members.
 */
export function middlewareOutputFormat(segments: OriginSegment[], options: { format?: OutputFormat, separator?: string } ) {
  const { format = OutputFormat.AllSegment, separator = ' ' } = options;

  // Concatenate the original text / the result of one segment.
  const originOf = (entry: OriginSegment) => entry.segment.map((part) => part.origin).join('');
  const resultOf = (entry: OriginSegment) => entry.segment.map((part) => part.result).join('');

  switch (format) {
    case OutputFormat.AllSegment:
      return segments.map((entry) => ({
        origin: originOf(entry),
        result: resultOf(entry),
      }));
    case OutputFormat.AllArray:
      return segments.map((entry) => entry.segment);
    case OutputFormat.AllString:
      return {
        origin: segments.map((entry) => originOf(entry)).join(separator),
        result: segments.map((entry) => resultOf(entry)).join(separator),
      };
    case OutputFormat.PinyinSegment:
      return segments.map((entry) => resultOf(entry));
    case OutputFormat.PinyinArray:
      return segments.map((entry) => entry.segment.map((part) => part.result));
    case OutputFormat.PinyinString:
      return segments.map((entry) => resultOf(entry)).join(separator);
    case OutputFormat.ZhSegment:
      return segments.map((entry) => originOf(entry));
    case OutputFormat.ZhArray:
      return segments.map((entry) => entry.segment.map((part) => part.origin));
    case OutputFormat.ZhString:
      return segments.map((entry) => originOf(entry)).join(separator);
    default:
      // Unknown format values yield undefined.
      return undefined;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.