Merge pull request #243 from zh-lx/feature-2-unicode

Feature 2 unicode
This commit is contained in:
zhoulixiang
2024-06-09 06:58:49 +08:00
committed by GitHub
19 changed files with 17812 additions and 5215 deletions

View File

@@ -1,6 +1,7 @@
/** Matches a string that is exactly one UTF-16 high (leading) surrogate code unit. */
export const DoubleUnicodePrefixReg = /^[\uD800-\uDBFF]$/;
/** Matches a string that is exactly one UTF-16 low (trailing) surrogate code unit. */
export const DoubleUnicodeSuffixReg = /^[\uDC00-\uDFFF]$/;
/** Matches every surrogate pair inside a string; global so that `replace` rewrites all pairs. */
export const DoubleUnicodeReg = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;
/**
 * Matches a string that is exactly one surrogate pair.
 * Deliberately has no `g` flag: the pattern is anchored, so it can match at most once,
 * and a global regex keeps `lastIndex` between calls, which makes repeated `.test()`
 * calls on the same input alternate between `true` and `false`.
 */
export const DoubleUnicodeCharReg = /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/;
export const enum Probability {
Unknown = 1e-13,
Rule = 1e-12,

View File

@@ -9,6 +9,7 @@ import { minTokenization } from "./min-tokenization";
import { reverseMaxMatch } from "./reverse-max-match";
import { Priority } from "@/common/constant";
import type { SurnameMode } from "../type";
import { splitString, stringLength } from "../utils";
export const enum TokenizationAlgorithm {
ReverseMaxMatch = 1,
@@ -69,12 +70,12 @@ export class AC {
// 构建 trie 树
buildTrie(patternList: Pattern[]) {
for (let pattern of patternList) {
const { zh } = pattern;
const zhChars = splitString(pattern.zh);
let cur = this.root;
for (let i = 0; i < zh.length; i++) {
let c = zh.charAt(i);
for (let i = 0; i < zhChars.length; i++) {
let c = zhChars[i];
if (!cur.children.has(c)) {
const trieNode = new TrieNode(cur, zh.slice(0, i), c);
const trieNode = new TrieNode(cur, zhChars.slice(0, i).join(''), c);
cur.children.set(c, trieNode);
this.addNodeToQueues(trieNode);
}
@@ -161,8 +162,9 @@ export class AC {
match(text: string, surname: SurnameMode) {
let cur = this.root;
let result: MatchPattern[] = [];
for (let i = 0; i < text.length; i++) {
let c = text.charAt(i);
const zhChars = splitString(text);
for (let i = 0; i < zhChars.length; i++) {
let c = zhChars[i];
while (cur !== null && !cur.children.has(c)) {
cur = cur.fail as TrieNode;
@@ -220,9 +222,9 @@ export class AC {
if (algorithm === TokenizationAlgorithm.ReverseMaxMatch) {
return reverseMaxMatch(patterns);
} else if (algorithm === TokenizationAlgorithm.MinTokenization) {
return minTokenization(patterns, text.length);
return minTokenization(patterns, stringLength(text));
}
return maxProbability(patterns, text.length);
return maxProbability(patterns, stringLength(text));
}
}

View File

@@ -1,7 +1,11 @@
import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';
import {
DoubleUnicodePrefixReg,
DoubleUnicodeSuffixReg,
DoubleUnicodeReg,
} from "./constant";
/**
 * Returns the length of `text` with every surrogate pair counted as one character.
 * Each match of `DoubleUnicodeReg` is collapsed to a one-character placeholder
 * before the length is measured.
 */
export function stringLength(text: string): number {
  return text.replace(DoubleUnicodeReg, "_").length;
}
// 双音节字符处理
@@ -10,7 +14,10 @@ export function splitString(text: string): string[] {
let i = 0;
while (i < text.length) {
const char = text.charAt(i);
if (DoubleUnicodePrefixReg.test(char) && DoubleUnicodeSuffixReg.test(text.charAt(i + 1))) {
if (
DoubleUnicodePrefixReg.test(char) &&
DoubleUnicodeSuffixReg.test(text.charAt(i + 1))
) {
result.push(text.substring(i, i + 2));
i += 2;
} else {
@@ -21,10 +28,35 @@ export function splitString(text: string): string[] {
return result;
}
export function isZhChar(char: string) {
if (typeof char !== 'string') {
return false;
/**
 * Pinyin lookup table keyed by a character string.
 * Keys whose `length` is 1 are kept in `NumberDICT`, indexed by the key's UTF-16 code unit;
 * longer keys (a surrogate-pair character has `length` 2) are kept in `StringDICT`.
 */
export class FastDictFactory {
// Entries for keys of length 1, indexed by `charCodeAt(0)`.
NumberDICT: string[];
// Entries for keys longer than one UTF-16 code unit.
StringDICT: Map<string, string>;
constructor() {
this.NumberDICT = [];
this.StringDICT = new Map();
}
// NOTE(review): the next three lines use `char`, which is not declared in this class —
// presumably leftover lines of the `isZhChar` helper; confirm before relying on them.
let code = char.charCodeAt(0);
return code >= 19968 && code <= 40869;
}
/**
 * Looks up the pinyin stored for `word`.
 * A key that was never stored yields `undefined` from the underlying Map/array lookup,
 * even though the declared return type is `string`.
 */
get(word: string): string {
if (word.length > 1) {
return this.StringDICT.get(word) as string;
} else {
const code = word.charCodeAt(0);
return this.NumberDICT[code];
}
}
/**
 * Stores `pinyin` for `word`, overwriting any value already stored under that key.
 */
set(word: string, pinyin: string) {
if (word.length > 1) {
this.StringDICT.set(word, pinyin);
} else {
const code = word.charCodeAt(0);
this.NumberDICT[code] = pinyin;
}
}
/**
 * Removes every entry: replaces the array with a new empty one and clears the Map.
 */
clear() {
this.NumberDICT = [];
this.StringDICT.clear();
}
}

View File

@@ -1,10 +1,10 @@
import { acTree } from '@/common/segmentit';
import { Probability, Priority } from '@/common/constant';
import { splitString, stringLength } from '@/common/utils';
import { FastDictFactory, splitString, stringLength } from '@/common/utils';
import DICT1 from '@/data/dict1';
let customDict: { [key: string]: string } = {};
let customMultipleDict: string[] = [];
let customPolyphonicDict: string[] = [];
const customMultipleDict = new FastDictFactory();
const customPolyphonicDict = new FastDictFactory();
type CustomHandleType = 'add' | 'replace';
@@ -29,20 +29,20 @@ const CustomDictName = Symbol('custom');
* @param {CustomPinyinOptions} options multiple/polyphonic 对于 customPinyin 补充词汇的处理
*/
export function customPinyin(
config: { [key: string]: string } = {},
config: { [word: string]: string } = {},
options?: CustomPinyinOptions
) {
const keys = Object.keys(config).sort(
(key1, key2) => stringLength(key2) - stringLength(key1)
const words = Object.keys(config).sort(
(word1, word2) => stringLength(word2) - stringLength(word1)
);
keys.forEach((key) => {
customDict[key] = config[key];
words.forEach((word) => {
customDict[word] = config[word];
});
const customPatterns = Object.keys(customDict).map((key) => ({
zh: key,
pinyin: customDict[key],
probability: Probability.Custom + stringLength(key),
length: key.length,
const customPatterns = Object.keys(customDict).map((word) => ({
zh: word,
pinyin: customDict[word],
probability: Probability.Custom + stringLength(word),
length: stringLength(word),
priority: Priority.Custom,
dict: CustomDictName,
}));
@@ -58,23 +58,21 @@ export function customPinyin(
function addCustomConfigToDict(
config: { [key: string]: string },
dict: string[],
dict: FastDictFactory,
handleType: CustomHandleType
) {
for (let key in config) {
const pinyins = config[key];
splitString(key).forEach((word, index) => {
for (let word in config) {
const pinyins = config[word];
splitString(word).forEach((char, index) => {
const pinyin = pinyins.split(' ')[index] || '';
const wordCode = word.charCodeAt(0);
if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
if (handleType === 'replace' || (handleType === 'add' && !dict.get(char) && !DICT1.get(char))) {
// 直接覆盖原词典
dict[wordCode] = pinyin;
dict.set(char, pinyin);
} else {
// 补充至原词典
dict[wordCode] = dict[wordCode] || DICT1[wordCode];
if (!dict[wordCode].split(' ').includes(pinyin)) {
dict[wordCode] += ` ${pinyin}`;
dict[wordCode] = dict[wordCode].trim();
dict.set(char, dict.get(char) || DICT1.get(char));
if (!dict.get(char).split(' ').includes(pinyin)) {
dict.set(char, `${dict.get(char)} ${pinyin}`.trim());
}
}
});
@@ -91,15 +89,15 @@ export const getCustomPolyphonicDict = () => {
/**
 * Clears the custom dictionaries selected by `dict`.
 * `dict` is either one dictionary type or an array of them; each of the three
 * types ('pinyin', 'multiple', 'polyphonic') is checked independently.
 */
// NOTE(review): this listing carries two variants of several statements below
// (`key`/`word` loops, `.length = 0`/`.clear()`) — presumably the pre- and
// post-change lines of the commit; confirm which one is live.
export function clearCustomDict(dict: CustomDictType | CustomDictType[]) {
if (dict === 'pinyin' || dict.indexOf('pinyin') !== -1) {
// Empty customDict in place by deleting every key, then drop the custom patterns from the AC tree.
Object.keys(customDict).forEach(function (key) {
delete customDict[key];
Object.keys(customDict).forEach(function (word) {
delete customDict[word];
});
acTree.removeDict(CustomDictName);
}
if (dict === 'multiple' || dict.indexOf('multiple') !== -1) {
customMultipleDict.length = 0;
customMultipleDict.clear();
}
if (dict === 'polyphonic' || dict.indexOf('polyphonic') !== -1) {
customPolyphonicDict.length = 0;
customPolyphonicDict.clear();
}
}

View File

@@ -26,36 +26,36 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
const name = typeof options === "object" ? options.name : options;
const dictName = name || DefaultName;
const dict1Handle = (options as DictOptions)?.dict1 || "add";
for (let key in dict as DICT) {
const value = (dict as DICT)[key];
for (let word in dict as DICT) {
const value = (dict as DICT)[word];
const pinyin = Array.isArray(value) ? value[0] : value;
if (stringLength(key) === 1) {
if (stringLength(word) === 1) {
addToOriginDict(
dictName,
key,
word,
pinyin,
dict1Handle
);
}
if (Array.isArray(value)) {
patterns.push({
zh: key,
zh: word,
pinyin,
probability:
typeof value[1] === "number"
? value[1]
: Probability.DICT * key.length * key.length,
length: key.length,
: Probability.DICT * stringLength(word) * stringLength(word),
length: stringLength(word),
priority: Priority.Normal,
dict: dictName,
pos: typeof value[2] === "string" ? value[2] : "",
});
} else {
patterns.push({
zh: key,
zh: word,
pinyin,
probability: Probability.DICT * key.length * key.length,
length: key.length,
probability: Probability.DICT * stringLength(word) * stringLength(word),
length: stringLength(word),
priority: Priority.Normal,
dict: dictName,
});
@@ -71,7 +71,7 @@ export function removeDict(dictName?: string) {
function addToOriginDict(
dict: string | Symbol,
key: string,
char: string,
pinyin: string,
handle: "add" | "replace" | "ignore" = "add"
) {
@@ -79,26 +79,25 @@ function addToOriginDict(
originDictMap.set(dict, {})
}
const originDict = originDictMap.get(dict)!;
const code = key.charCodeAt(0);
if (!originDict[key]) {
originDict[key] = DICT1[code] as string;
if (!originDict[char]) {
originDict[char] = DICT1.get(char) as string;
}
if (handle === "add") {
if (DICT1[code] && !DICT1[code].split(' ').includes(pinyin)) {
DICT1[code] += ` ${pinyin}`;
} else if (!DICT1[code]) {
DICT1[code] = pinyin;
const existedPinyin = DICT1.get(char);
if (existedPinyin && !existedPinyin.split(' ').includes(pinyin)) {
DICT1.set(char, `${existedPinyin} ${pinyin}`);
} else if (!DICT1.get(char)) {
DICT1.set(char, pinyin);
}
} else if (handle === "replace") {
DICT1[code] = pinyin;
DICT1.set(char, pinyin);
}
}
/**
 * Writes the values recorded for dictionary `dict` in `originDictMap` back into DICT1,
 * deleting each record once it has been written back.
 * When no record exists for `dict`, an empty object is used and nothing is written.
 */
// NOTE(review): two loop variants appear below (`key` with a char-code index, `char` with
// `DICT1.set`) — presumably the pre- and post-change lines of the commit; confirm which is live.
function removeOriginDict(dict: string | Symbol) {
const originDict = originDictMap.get(dict) || {};
for (let key in originDict) {
const code = key.charCodeAt(0);
DICT1[code] = originDict[key];
delete originDict[key];
for (let char in originDict) {
DICT1.set(char, originDict[char]);
delete originDict[char];
}
}

View File

@@ -13,22 +13,20 @@ import { SingleWordResult } from "../../common/type";
import type { SurnameMode } from "../../common/type";
import { acTree, TokenizationAlgorithm } from "../../common/segmentit";
import {
DoubleUnicodePrefixReg,
DoubleUnicodeSuffixReg,
Priority,
} from "@/common/constant";
import { splitString } from "@/common/utils";
/**
* @description: Get the pinyin of a single character. The DICT1 entry is split on
* spaces and its first item is returned; when DICT1 has no entry the input is returned unchanged.
* @param {string} word
* @param {string} char
* @return {string}
*/
// NOTE(review): the type alias and the function are each declared twice below (parameter
// `word` vs `char`) — presumably the pre- and post-change lines of the commit; confirm which is live.
type GetSingleWordPinyin = (word: string) => string;
export const getSingleWordPinyin: GetSingleWordPinyin = (word) => {
const wordCode = word.charCodeAt(0);
const pinyin = DICT1[wordCode];
type GetSingleWordPinyin = (char: string) => string;
export const getSingleWordPinyin: GetSingleWordPinyin = (char) => {
const pinyin = DICT1.get(char);
// If found, return the first pinyin; otherwise return the original character
return pinyin ? pinyin.split(" ")[0] : word;
return pinyin ? pinyin.split(" ")[0] : char;
};
export const getPinyin = (
@@ -39,13 +37,14 @@ export const getPinyin = (
): SingleWordResult[] => {
const matches = acTree.search(word, surname, segmentit);
let matchIndex = 0;
for (let i = 0; i < word.length; ) {
const zhChars = splitString(word);
for (let i = 0; i < zhChars.length; ) {
const match = matches[matchIndex];
if (match && i === match.index) {
if (match.length === 1 && match.priority <= Priority.Normal) {
const char = word[i];
const char = zhChars[i];
let pinyin: string = "";
pinyin = processSepecialPinyin(char, word[i - 1], word[i + 1]);
pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]);
list[i] = {
origin: char,
result: pinyin,
@@ -59,32 +58,21 @@ export const getPinyin = (
const pinyins = match.pinyin.split(" ");
let pinyinIndex = 0;
for (let j = 0; j < match.length; j++) {
if (
DoubleUnicodePrefixReg.test(match.zh[j - 1]) &&
DoubleUnicodeSuffixReg.test(match.zh[j])
) {
list[i + j] = {
origin: match.zh[j],
result: "",
isZh: true,
originPinyin: "",
};
} else {
list[i + j] = {
origin: match.zh[j],
result: pinyins[pinyinIndex],
isZh: true,
originPinyin: pinyins[pinyinIndex],
};
pinyinIndex++;
}
const zhChars = splitString(match.zh);
list[i + j] = {
origin: zhChars[j],
result: pinyins[pinyinIndex],
isZh: true,
originPinyin: pinyins[pinyinIndex],
};
pinyinIndex++;
}
i += match.length;
matchIndex++;
} else {
const char = word[i];
const char = zhChars[i];
let pinyin: string = "";
pinyin = processSepecialPinyin(char, word[i - 1], word[i + 1]);
pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]);
list[i] = {
origin: char,
result: pinyin,
@@ -117,18 +105,17 @@ const getPinyinWithoutTone: GetPinyinWithoutTone = (pinyin) => {
/**
* @description: 获取单字符的多音拼音
* @param {string} word
* @param {string} char
* @return {WordResult[]}
*/
type GetAllPinyin = (word: string, surname?: SurnameMode) => string[];
export const getAllPinyin: GetAllPinyin = (word, surname = "off") => {
const wordCode = word.charCodeAt(0);
type GetAllPinyin = (char: string, surname?: SurnameMode) => string[];
export const getAllPinyin: GetAllPinyin = (char, surname = "off") => {
const customMultpileDict = getCustomMultpileDict();
let pinyin = DICT1[wordCode] ? DICT1[wordCode].split(" ") : [];
if (customMultpileDict[wordCode]) {
pinyin = customMultpileDict[wordCode].split(" ");
let pinyin = DICT1.get(char) ? DICT1.get(char).split(" ") : [];
if (customMultpileDict.get(char)) {
pinyin = customMultpileDict.get(char).split(" ");
} else if (surname !== "off") {
const surnamePinyin = Surnames[word];
const surnamePinyin = Surnames[char];
if (surnamePinyin) {
pinyin = [surnamePinyin].concat(
pinyin.filter((py) => py !== surnamePinyin)

View File

@@ -1,3 +1,4 @@
import { stringLength } from "@/common/utils";
import { TokenizationAlgorithm } from "../../common/segmentit";
import type {
SingleWordResult,
@@ -13,7 +14,6 @@ import {
middlewareToneType,
middlewareV,
middlewareType,
middlewareDoubleUnicode,
middlewareToneSandhi,
} from "./middlewares";
@@ -241,7 +241,7 @@ function pinyin(
options.nonZh = "removed";
}
let list: SingleWordResult[] = Array(word.length);
let list: SingleWordResult[] = Array(stringLength(word));
list = getPinyin(
word,
@@ -253,9 +253,6 @@ function pinyin(
// 一和不变调处理
list = middlewareToneSandhi(list, options.toneSandhi as boolean);
// 双 unicode 编码字符处理
list = middlewareDoubleUnicode(list);
// nonZh 参数及 removeNonZh 参数
list = middleWareNonZh(list, options);

View File

@@ -1,9 +1,5 @@
import { stringLength, isZhChar } from "@/common/utils";
import { stringLength } from "@/common/utils";
import type { SingleWordResult } from "../../common/type";
import {
DoubleUnicodePrefixReg,
DoubleUnicodeSuffixReg,
} from "@/common/constant";
import { getAllPinyin, getMultiplePinyin } from "./handle";
import { CompleteOptions } from "./index";
import {
@@ -14,6 +10,7 @@ import {
getPinyinWithoutTone,
getPinyinWithNum,
} from "./handle";
import DICT1 from "@/data/dict1";
// 验证输入是否为字符串
export const validateType = (word: unknown) => {
@@ -202,37 +199,13 @@ export const middlewareType = (
num: Number(getNumOfTone(item.originPinyin)),
isZh: item.isZh,
polyphonic,
inZhRange: isZhChar(item.origin),
inZhRange: !!DICT1.get(item.origin),
};
});
}
return list.map((item) => item.result).join(options.separator);
};
// Handle double-Unicode (surrogate pair) characters: fold the second half into the first, then drop it
export const middlewareDoubleUnicode = (
  list: SingleWordResult[]
): SingleWordResult[] => {
  // Walk backwards over adjacent entries, starting with the last two.
  let idx = list.length - 2;
  while (idx >= 0) {
    const head = list[idx];
    const tail = list[idx + 1];
    const isSurrogatePair =
      DoubleUnicodePrefixReg.test(head.origin) &&
      DoubleUnicodeSuffixReg.test(tail.origin);
    if (isSurrogatePair) {
      // Merge the trailing half into the leading entry and mark the trailing one for removal.
      head.origin += tail.origin;
      head.result += tail.result;
      head.originPinyin = head.result;
      tail.delete = true;
      // Step over the merged entry as well as the normal one-step decrement.
      idx -= 2;
    } else {
      idx -= 1;
    }
  }
  return list.filter((item) => !item.delete);
};
// 是否开启变调
export const middlewareToneSandhi = (
list: SingleWordResult[],

View File

@@ -5,7 +5,6 @@ import {
middlewarePattern,
middlewareToneType,
middlewareV,
middlewareDoubleUnicode,
} from '@/core/pinyin/middlewares';
import DICT1 from '@/data/dict1';
import {
@@ -15,7 +14,7 @@ import {
getFinalParts,
} from '@/core/pinyin/handle';
import { getCustomPolyphonicDict } from '../custom';
import { isZhChar, splitString } from '@/common/utils';
import { splitString } from '@/common/utils';
interface BasicOptions {
/**
@@ -177,8 +176,6 @@ function polyphonic(
let list = getPolyphonicList(text);
list = middlewareDoubleUnicode(list);
// nonZh 参数及 removeNonZh 参数
list = middleWareNonZh(list, options);
@@ -206,14 +203,13 @@ function polyphonic(
// 获取每个字多音字的数组
const getPolyphonicList = (text: string): SingleWordResult[] => {
return splitString(text).map((word) => {
const wordCode = word.charCodeAt(0);
return splitString(text).map((char) => {
const customPolyphonicDict = getCustomPolyphonicDict();
const pinyin = customPolyphonicDict[wordCode] || DICT1[wordCode] || word;
const pinyin = customPolyphonicDict.get(char) || DICT1.get(char) || char;
return {
origin: word,
origin: char,
result: pinyin,
isZh: pinyin !== word,
isZh: pinyin !== char,
originPinyin: pinyin,
};
});
@@ -259,7 +255,7 @@ export const handleType = (
finalTail: tail,
num: Number(getNumOfTone(item.originPinyin)),
isZh: item.isZh,
inZhRange: isZhChar(item.origin),
inZhRange: !!DICT1.get(item.origin),
};
});
}

View File

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ import {
getNumOfTone,
getPinyinWithoutTone,
} from '@/core/pinyin/handle';
import { isZhChar } from '@/common/utils';
import DICT1 from './dict1';
export const InitialList = [
'zh',
@@ -225,7 +225,7 @@ export function processToneSandhi(cur: string, pre: string, next: string) {
// 处理「了」字的变调
export function processToneSandhiLiao(cur: string, pre: string) {
if (cur === '了' && !isZhChar(pre)) {
if (cur === '了' && (!pre || !DICT1.get(pre))) {
return 'liǎo';
}
}

View File

@@ -60,4 +60,14 @@ describe("addDict", () => {
expect(result).to.be.equal("yī");
removeDict();
});
it("[addDict]2 unicode dict", () => {
const stringDict = {
𧒽: 'lei'
}
addDict(stringDict, { name: 'double-unicode-dict' });
const result = pinyin("𧒽");
expect(result).to.be.equal("lei");
removeDict();
});
});

View File

@@ -28,4 +28,10 @@ describe('double unicode', () => {
});
expect(result5).to.be.equal('cè shì a𧒽𧒽a cè shì a𧒽𧒽a cè shì');
});
it('[double unicode]dp consecutive', () => {
const result4 = pinyin('测试𬭬𬭬测试𬭬测试');
expect(result4).to.be.equal('cè shì huì huì cè shì huì cè shì');
});
});

View File

@@ -1,6 +1,7 @@
export declare const DoubleUnicodePrefixReg: RegExp;
export declare const DoubleUnicodeSuffixReg: RegExp;
export declare const DoubleUnicodeReg: RegExp;
export declare const DoubleUnicodeCharReg: RegExp;
export declare const enum Probability {
Unknown = 1e-13,
Rule = 1e-12,

View File

@@ -1,3 +1,10 @@
export declare function stringLength(text: string): number;
export declare function splitString(text: string): string[];
export declare function isZhChar(char: string): boolean;
export declare class FastDictFactory {
NumberDICT: string[];
StringDICT: Map<string, string>;
constructor();
get(word: string): string;
set(word: string, pinyin: string): void;
clear(): void;
}

View File

@@ -1,3 +1,4 @@
import { FastDictFactory } from '@/common/utils';
type CustomHandleType = 'add' | 'replace';
type CustomDictType = 'pinyin' | 'multiple' | 'polyphonic';
interface CustomPinyinOptions {
@@ -16,9 +17,9 @@ interface CustomPinyinOptions {
* @param {CustomPinyinOptions} options multiple/polyphonic 对于 customPinyin 补充词汇的处理
*/
export declare function customPinyin(config?: {
[key: string]: string;
[word: string]: string;
}, options?: CustomPinyinOptions): void;
export declare const getCustomMultpileDict: () => string[];
export declare const getCustomPolyphonicDict: () => string[];
export declare const getCustomMultpileDict: () => FastDictFactory;
export declare const getCustomPolyphonicDict: () => FastDictFactory;
export declare function clearCustomDict(dict: CustomDictType | CustomDictType[]): void;
export {};

View File

@@ -3,10 +3,10 @@ import type { SurnameMode } from "../../common/type";
import { TokenizationAlgorithm } from "../../common/segmentit";
/**
* @description: 获取单个字符的拼音
* @param {string} word
* @param {string} char
* @return {string}
*/
type GetSingleWordPinyin = (word: string) => string;
type GetSingleWordPinyin = (char: string) => string;
export declare const getSingleWordPinyin: GetSingleWordPinyin;
export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm) => SingleWordResult[];
/**
@@ -18,10 +18,10 @@ type GetPinyinWithoutTone = (pinyin: string) => string;
declare const getPinyinWithoutTone: GetPinyinWithoutTone;
/**
* @description: 获取单字符的多音拼音
* @param {string} word
* @param {string} char
* @return {WordResult[]}
*/
type GetAllPinyin = (word: string, surname?: SurnameMode) => string[];
type GetAllPinyin = (char: string, surname?: SurnameMode) => string[];
export declare const getAllPinyin: GetAllPinyin;
/**
* @description: 获取单字符的多音拼音

View File

@@ -20,5 +20,4 @@ export declare const middlewareType: (list: SingleWordResult[], options: Complet
polyphonic: string[];
inZhRange: boolean;
}[];
export declare const middlewareDoubleUnicode: (list: SingleWordResult[]) => SingleWordResult[];
export declare const middlewareToneSandhi: (list: SingleWordResult[], toneSandhi: boolean) => SingleWordResult[];

View File

@@ -1,2 +1,3 @@
declare const DICT1: string[];
import { FastDictFactory } from "@/common/utils";
declare const DICT1: FastDictFactory;
export default DICT1;