Skip to content

Commit

Permalink
Merge pull request #265 from zh-lx/feature-segment-split
Browse files Browse the repository at this point in the history
feat: add segment api
  • Loading branch information
zh-lx authored Jul 28, 2024
2 parents 0a1adcf + 2fdbd9a commit cfa0d07
Show file tree
Hide file tree
Showing 14 changed files with 819 additions and 10 deletions.
6 changes: 3 additions & 3 deletions lib/core/pinyin/handle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import DICT1 from "@/data/dict1";
import { getCustomMultpileDict } from "@/core/custom";
import { SingleWordResult } from "../../common/type";
import type { SurnameMode } from "../../common/type";
import { acTree, TokenizationAlgorithm } from "../../common/segmentit";
import { acTree, MatchPattern, TokenizationAlgorithm } from "../../common/segmentit";
import {
Priority,
} from "@/common/constant";
Expand All @@ -34,7 +34,7 @@ export const getPinyin = (
list: SingleWordResult[],
surname: SurnameMode,
segmentit: TokenizationAlgorithm
): SingleWordResult[] => {
): { list: SingleWordResult[], matches: MatchPattern[] } => {
const matches = acTree.search(word, surname, segmentit);
let matchIndex = 0;
const zhChars = splitString(word);
Expand Down Expand Up @@ -82,7 +82,7 @@ export const getPinyin = (
i++;
}
}
return list;
return { list, matches };
};

/**
Expand Down
9 changes: 5 additions & 4 deletions lib/core/pinyin/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import {
middlewareToneSandhi,
} from "./middlewares";

interface BasicOptions {
export interface BasicOptions {
/**
* @description 返回的拼音音调类型
* @value symbol:在字母上加音调 (默认值)
Expand Down Expand Up @@ -111,6 +111,7 @@ interface AllData {
isZh: boolean;
polyphonic: string[];
inZhRange: boolean;
result: string; // 3.24.0 新增
}

interface OptionsReturnString extends BasicOptions {
Expand Down Expand Up @@ -241,11 +242,11 @@ function pinyin(
options.nonZh = "removed";
}

let list: SingleWordResult[] = Array(stringLength(word));
let _list = Array(stringLength(word));

list = getPinyin(
let { list } = getPinyin(
word,
list,
_list,
options.surname as SurnameMode,
options.segmentit as TokenizationAlgorithm
);
Expand Down
1 change: 1 addition & 0 deletions lib/core/pinyin/middlewares.ts
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ export const middlewareType = (
isZh: item.isZh,
polyphonic,
inZhRange: !!DICT1.get(item.origin),
result: item.result,
};
});
}
Expand Down
161 changes: 161 additions & 0 deletions lib/core/segment/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import { BasicOptions } from "../pinyin";
import { TokenizationAlgorithm } from "../../common/segmentit";
import { stringLength } from "@/common/utils";
import { middleWareNonZh, middlewareToneSandhi, middlewareToneType, middlewareV, validateType } from "@/core/pinyin/middlewares";
import { getPinyin } from "@/core/pinyin/handle";
import { SurnameMode } from "../../common/type";
import { middlewareOutputFormat, middlewareSegment, Output, OutputFormat } from "./middlewares";

/**
* Subset of the pinyin `BasicOptions` that the segment API accepts.
* Only the keys listed here are picked; every format-specific option
* interface in this file extends this type.
*/
type SegmentBaseOptions = Pick<
BasicOptions,
"toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit"
>;

/**
* Options for the overload of `segment` that returns `Output['AllSegment']`.
*/
interface AllSegmentReturnOptions extends SegmentBaseOptions {
/**
* @description Return all information (original text and converted result) in segment form:
* one `{ origin, result }` entry per segment.
*/
format?: OutputFormat.AllSegment;
}

/**
* Options for the overload of `segment` that returns `Output['AllArray']`.
*/
interface AllArrayReturnOptions extends SegmentBaseOptions {
/**
* @description Return all information (original text and converted result) in array form:
* for each segment, an array with one `{ origin, result }` entry per item.
*/
format?: OutputFormat.AllArray;
}

/**
* Options for the overload of `segment` that returns `Output['AllString']`.
*/
interface AllStringReturnOptions extends SegmentBaseOptions {
/**
* @description Return all information (original text and converted result) in string form:
* a single `{ origin, result }` whose fields are the segments joined by `separator`.
*/
format?: OutputFormat.AllString;
/**
* @description Separator placed between segments. Defaults to a single space. Only takes effect
* when format is AllString (3), PinyinString (6) or ZhString (9).
*/
separator?: string;
}

/**
* Options for the overload of `segment` that returns `Output['PinyinSegment']`.
*/
interface PinyinSegmentReturnOptions extends SegmentBaseOptions {
/**
* @description Return the pinyin in segment form: one string per segment.
*/
format?: OutputFormat.PinyinSegment;
}

/**
* Options for the overload of `segment` that returns `Output['PinyinArray']`.
*/
interface PinyinArrayReturnOptions extends SegmentBaseOptions {
/**
* @description Return the pinyin in array form: for each segment, an array with one string per item.
*/
format?: OutputFormat.PinyinArray;
}

/**
* Options for the overload of `segment` that returns `Output['PinyinString']`.
*/
interface PinyinStringReturnOptions extends SegmentBaseOptions {
/**
* @description Return the pinyin in string form: all segments joined by `separator`.
*/
format?: OutputFormat.PinyinString;
/**
* @description Separator placed between segments. Defaults to a single space. Only takes effect
* when format is AllString (3), PinyinString (6) or ZhString (9).
*/
separator?: string;
}

/**
* Options for the overload of `segment` that returns `Output['ZhSegment']`.
*/
interface ZhSegmentReturnOptions extends SegmentBaseOptions {
/**
* @description Return the original (Chinese) text in segment form: one string per segment.
*/
format?: OutputFormat.ZhSegment;
}

/**
* Options for the overload of `segment` that returns `Output['ZhArray']`.
*/
interface ZhArrayReturnOptions extends SegmentBaseOptions {
/**
* @description Return the original (Chinese) text in array form: for each segment, an array with one string per item.
*/
format?: OutputFormat.ZhArray;
}

/**
* Options for the overload of `segment` that returns `Output['ZhString']`.
*/
interface ZhStringReturnOptions extends SegmentBaseOptions {
/**
* @description Return the original (Chinese) text in string form: all segments joined by `separator`.
*/
format?: OutputFormat.ZhString;
/**
* @description Separator placed between segments. Defaults to a single space. Only takes effect
* when format is AllString (3), PinyinString (6) or ZhString (9).
*/
separator?: string;
}

/**
* Full option set accepted by the implementation signature of `segment`:
* the shared base options plus any output format and the separator.
*/
export interface SegmentCompleteOptions extends SegmentBaseOptions {
/**
* @description Which output shape to produce; see `OutputFormat`.
*/
format?: OutputFormat;
/**
* @description Separator placed between segments. Defaults to a single space. Only takes effect
* when format is AllString (3), PinyinString (6) or ZhString (9).
*/
separator?: string;
}

/**
* Defaults that `segment` spreads underneath the caller-supplied options.
* `surname` is deliberately absent: `segment` derives it from `mode`
* when the caller leaves it undefined.
*/
const DEFAULT_OPTIONS: SegmentCompleteOptions = {
toneType: "symbol",
mode: "normal",
nonZh: "spaced",
v: false,
separator: " ",
toneSandhi: true,
segmentit: TokenizationAlgorithm.MaxProbability,
format: OutputFormat.AllSegment,
};

// One overload per output format so the return type follows `options.format`.
// Calls that omit `format` resolve to the first overload, matching the
// AllSegment default in DEFAULT_OPTIONS.
export function segment(word: string, options?: AllSegmentReturnOptions): Output['AllSegment'];
export function segment(word: string, options?: AllArrayReturnOptions): Output['AllArray'];
export function segment(word: string, options?: AllStringReturnOptions): Output['AllString'];
export function segment(word: string, options?: PinyinSegmentReturnOptions): Output['PinyinSegment'];
export function segment(word: string, options?: PinyinArrayReturnOptions): Output['PinyinArray'];
export function segment(word: string, options?: PinyinStringReturnOptions): Output['PinyinString'];
export function segment(word: string, options?: ZhSegmentReturnOptions): Output['ZhSegment'];
export function segment(word: string, options?: ZhArrayReturnOptions): Output['ZhArray'];
export function segment(word: string, options?: ZhStringReturnOptions): Output['ZhString'];

/**
 * Converts `word` to pinyin, groups the result by tokenizer matches and
 * renders it in the requested output format.
 *
 * @param word - Text to segment. If `validateType` rejects it, it is returned unchanged.
 * @param options - Conversion and output options; missing keys fall back to DEFAULT_OPTIONS.
 * @returns The value produced by `middlewareOutputFormat` for `options.format`.
 */
export function segment(word: string, options?: SegmentCompleteOptions) {
  const settings: SegmentCompleteOptions = { ...DEFAULT_OPTIONS, ...(options || {}) };

  // Bail out early when `word` does not have an acceptable type.
  if (!validateType(word)) {
    return word;
  }

  // When no surname mode was given, derive it from `mode`.
  if (settings.surname === undefined) {
    settings.surname = settings.mode === "surname" ? "all" : "off";
  }

  const placeholder = Array(stringLength(word));
  const converted = getPinyin(
    word,
    placeholder,
    settings.surname as SurnameMode,
    settings.segmentit as TokenizationAlgorithm
  );
  const matches = converted.matches;

  // Tone sandhi handling for "一" and "不".
  let list = middlewareToneSandhi(converted.list, settings.toneSandhi as boolean);

  // Apply the nonZh option.
  list = middleWareNonZh(list, settings);

  // Apply the toneType option (return value is not used).
  middlewareToneType(list, settings);

  // Apply the v option (return value is not used).
  middlewareV(list, settings);

  const grouped = middlewareSegment(list, matches);

  return middlewareOutputFormat(grouped, {
    format: settings.format,
    separator: settings.separator,
  });
}

export { OutputFormat };
117 changes: 117 additions & 0 deletions lib/core/segment/middlewares.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import { splitString } from "@/common/utils";
import { MatchPattern } from "../../common/segmentit";
import { SingleWordResult } from "../../common/type";

/**
* Intermediate representation of one segment, as produced by
* `middlewareSegment` and consumed by `middlewareOutputFormat`.
*/
interface OriginSegment {
/** The list items grouped into this segment, each reduced to its `origin` and `result`. */
segment: {
origin: string;
result: string;
}[];
/** True when the segment was built from a tokenizer match; false for an item that did not line up with the current match. */
isZh: boolean;
}

/**
* Output shapes supported by `middlewareOutputFormat`.
* "All" formats carry both `origin` and `result`, "Pinyin" formats carry only
* `result`, and "Zh" formats carry only `origin`.
*/
export enum OutputFormat {
/** One `{ origin, result }` per segment, with the segment's items concatenated. */
AllSegment = 1,
/** For each segment, an array of per-item `{ origin, result }` objects. */
AllArray = 2,
/** A single `{ origin, result }` whose fields are the segments joined by the separator. */
AllString = 3,
/** One `result` string per segment. */
PinyinSegment = 4,
/** For each segment, an array of per-item `result` strings. */
PinyinArray = 5,
/** All segment `result` strings joined by the separator. */
PinyinString = 6,
/** One `origin` string per segment. */
ZhSegment = 7,
/** For each segment, an array of per-item `origin` strings. */
ZhArray = 8,
/** All segment `origin` strings joined by the separator. */
ZhString = 9,
}

/**
* Maps each `OutputFormat` member name to the return type used for it
* in the `segment` overloads.
*/
export interface Output {
AllSegment: { origin: string, result: string, }[];
AllArray: { origin: string, result: string, }[][];
AllString: { origin: string, result: string, };
PinyinSegment: string[];
PinyinArray: string[][];
PinyinString: string;
ZhSegment: string[];
ZhArray: string[][];
ZhString: string;
}


/**
 * Groups per-item conversion results into segments that follow the
 * tokenizer's matches.
 *
 * `list` and `matches` are walked in parallel. When the current item's
 * `origin` is a prefix of the current match's `zh`, the item and every
 * following item that lines up with the characters of that match are grouped
 * into one segment flagged `isZh: true`, and the match is consumed. Any other
 * item becomes a single-item segment flagged `isZh: false`.
 *
 * Items left over once every match has been consumed (including the case of
 * no matches at all) are still emitted as single-item segments, so the tail
 * of the input is never dropped.
 *
 * @param list - Conversion results in input order.
 * @param matches - Tokenizer matches; assumed to be in input order — TODO confirm against `acTree.search`.
 * @returns Segments that together cover every item of `list`, in order.
 */
export function middlewareSegment(list: SingleWordResult[], matches: MatchPattern[]): OriginSegment[] {
  const segments: OriginSegment[] = [];

  let i = 0;
  let j = 0;
  // The loop is driven by `list` alone. Also requiring `j < matches.length`
  // would stop at the last match and silently lose every trailing item.
  while (i < list.length) {
    const item = list[i];
    const match = j < matches.length ? matches[j] : undefined;

    if (match !== undefined && match.zh.startsWith(item.origin)) {
      const chars = splitString(match.zh);
      // Extend the segment while successive items equal the successive
      // characters of the match; an out-of-range `chars` index yields
      // `undefined`, which ends the run.
      let end = i + 1;
      while (end < list.length && list[end].origin === chars[end - i]) {
        end++;
      }
      segments.push({
        segment: list.slice(i, end).map((entry) => ({
          origin: entry.origin,
          result: entry.result,
        })),
        isZh: true,
      });
      i = end;
      j++;
    } else {
      segments.push({
        segment: [
          {
            origin: item.origin,
            result: item.result,
          },
        ],
        isZh: false,
      });
      i++;
    }
  }

  return segments;
}


/**
 * Renders grouped segments in the shape selected by `options.format`.
 *
 * @param segments - Segments produced by `middlewareSegment`.
 * @param options - `format` defaults to `OutputFormat.AllSegment` and
 *   `separator` to a single space when they are undefined. The separator is
 *   only used by the three `*String` formats.
 * @returns The rendered value, or `undefined` when `format` is not one of the
 *   `OutputFormat` members.
 */
export function middlewareOutputFormat(segments: OriginSegment[], options: { format?: OutputFormat, separator?: string } ) {
  const format = options.format === undefined ? OutputFormat.AllSegment : options.format;
  const separator = options.separator === undefined ? ' ' : options.separator;

  // Per-segment projections shared by several formats.
  const originsOf = (entry: OriginSegment) => entry.segment.map((part) => part.origin);
  const resultsOf = (entry: OriginSegment) => entry.segment.map((part) => part.result);
  const collapse = (entry: OriginSegment) => ({
    origin: originsOf(entry).join(''),
    result: resultsOf(entry).join(''),
  });

  switch (format) {
    case OutputFormat.AllSegment:
      return segments.map(collapse);
    case OutputFormat.AllArray:
      return segments.map((entry) => entry.segment);
    case OutputFormat.AllString: {
      const collapsed = segments.map(collapse);
      return {
        origin: collapsed.map((pair) => pair.origin).join(separator),
        result: collapsed.map((pair) => pair.result).join(separator),
      };
    }
    case OutputFormat.PinyinSegment:
      return segments.map((entry) => resultsOf(entry).join(''));
    case OutputFormat.PinyinArray:
      return segments.map(resultsOf);
    case OutputFormat.PinyinString:
      return segments.map((entry) => resultsOf(entry).join('')).join(separator);
    case OutputFormat.ZhSegment:
      return segments.map((entry) => originsOf(entry).join(''));
    case OutputFormat.ZhArray:
      return segments.map(originsOf);
    case OutputFormat.ZhString:
      return segments.map((entry) => originsOf(entry).join('')).join(separator);
    default:
      return undefined;
  }
}
1 change: 1 addition & 0 deletions lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ export { match } from './core/match';
export { html } from './core/html';
export { polyphonic } from './core/polyphonic';
export { convert } from './core/convert';
export { segment, OutputFormat } from './core/segment';
Loading

0 comments on commit cfa0d07

Please sign in to comment.