mirror of
https://github.com/zh-lx/pinyin-pro.git
synced 2026-03-13 09:51:38 +08:00
@@ -1,6 +1,7 @@
|
||||
export const DoubleUnicodePrefixReg = /^[\uD800-\uDBFF]$/;
|
||||
export const DoubleUnicodeSuffixReg = /^[\uDC00-\uDFFF]$/;
|
||||
export const DoubleUnicodeReg = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;
|
||||
/**
 * Matches a string that consists of exactly one UTF-16 surrogate pair
 * (a high surrogate in U+D800–U+DBFF followed by a low surrogate in
 * U+DC00–U+DFFF), i.e. a single character outside the Basic Multilingual Plane.
 *
 * The pattern is intentionally NOT global: it is anchored with `^…$`, so a `g`
 * flag can never yield additional matches, but it would make `.test()` and
 * `.exec()` stateful through `lastIndex`, causing every second call on a
 * matching string to return `false`.
 */
export const DoubleUnicodeCharReg = /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/;
|
||||
export const enum Probability {
|
||||
Unknown = 1e-13,
|
||||
Rule = 1e-12,
|
||||
|
||||
@@ -9,6 +9,7 @@ import { minTokenization } from "./min-tokenization";
|
||||
import { reverseMaxMatch } from "./reverse-max-match";
|
||||
import { Priority } from "@/common/constant";
|
||||
import type { SurnameMode } from "../type";
|
||||
import { splitString, stringLength } from "../utils";
|
||||
|
||||
export const enum TokenizationAlgorithm {
|
||||
ReverseMaxMatch = 1,
|
||||
@@ -69,12 +70,12 @@ export class AC {
|
||||
// 构建 trie 树
|
||||
buildTrie(patternList: Pattern[]) {
|
||||
for (let pattern of patternList) {
|
||||
const { zh } = pattern;
|
||||
const zhChars = splitString(pattern.zh);
|
||||
let cur = this.root;
|
||||
for (let i = 0; i < zh.length; i++) {
|
||||
let c = zh.charAt(i);
|
||||
for (let i = 0; i < zhChars.length; i++) {
|
||||
let c = zhChars[i];
|
||||
if (!cur.children.has(c)) {
|
||||
const trieNode = new TrieNode(cur, zh.slice(0, i), c);
|
||||
const trieNode = new TrieNode(cur, zhChars.slice(0, i).join(''), c);
|
||||
cur.children.set(c, trieNode);
|
||||
this.addNodeToQueues(trieNode);
|
||||
}
|
||||
@@ -161,8 +162,9 @@ export class AC {
|
||||
match(text: string, surname: SurnameMode) {
|
||||
let cur = this.root;
|
||||
let result: MatchPattern[] = [];
|
||||
for (let i = 0; i < text.length; i++) {
|
||||
let c = text.charAt(i);
|
||||
const zhChars = splitString(text);
|
||||
for (let i = 0; i < zhChars.length; i++) {
|
||||
let c = zhChars[i];
|
||||
|
||||
while (cur !== null && !cur.children.has(c)) {
|
||||
cur = cur.fail as TrieNode;
|
||||
@@ -220,9 +222,9 @@ export class AC {
|
||||
if (algorithm === TokenizationAlgorithm.ReverseMaxMatch) {
|
||||
return reverseMaxMatch(patterns);
|
||||
} else if (algorithm === TokenizationAlgorithm.MinTokenization) {
|
||||
return minTokenization(patterns, text.length);
|
||||
return minTokenization(patterns, stringLength(text));
|
||||
}
|
||||
return maxProbability(patterns, text.length);
|
||||
return maxProbability(patterns, stringLength(text));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';
|
||||
import {
|
||||
DoubleUnicodePrefixReg,
|
||||
DoubleUnicodeSuffixReg,
|
||||
DoubleUnicodeReg,
|
||||
} from "./constant";
|
||||
|
||||
export function stringLength(text: string) {
|
||||
return text.replace(DoubleUnicodeReg, '_').length;
|
||||
return text.replace(DoubleUnicodeReg, "_").length;
|
||||
}
|
||||
|
||||
// 双音节字符处理
|
||||
@@ -10,7 +14,10 @@ export function splitString(text: string): string[] {
|
||||
let i = 0;
|
||||
while (i < text.length) {
|
||||
const char = text.charAt(i);
|
||||
if (DoubleUnicodePrefixReg.test(char) && DoubleUnicodeSuffixReg.test(text.charAt(i + 1))) {
|
||||
if (
|
||||
DoubleUnicodePrefixReg.test(char) &&
|
||||
DoubleUnicodeSuffixReg.test(text.charAt(i + 1))
|
||||
) {
|
||||
result.push(text.substring(i, i + 2));
|
||||
i += 2;
|
||||
} else {
|
||||
@@ -21,10 +28,35 @@ export function splitString(text: string): string[] {
|
||||
return result;
|
||||
}
|
||||
|
||||
export function isZhChar(char: string) {
|
||||
if (typeof char !== 'string') {
|
||||
return false;
|
||||
export class FastDictFactory {
|
||||
NumberDICT: string[];
|
||||
StringDICT: Map<string, string>;
|
||||
|
||||
constructor() {
|
||||
this.NumberDICT = [];
|
||||
this.StringDICT = new Map();
|
||||
}
|
||||
let code = char.charCodeAt(0);
|
||||
return code >= 19968 && code <= 40869;
|
||||
}
|
||||
|
||||
/**
 * Looks up the pinyin stored for `word`.
 *
 * Keys longer than one UTF-16 code unit are read from `StringDICT`; a key of
 * a single code unit is read from `NumberDICT` at index `charCodeAt(0)`.
 *
 * @param word - key to look up
 * @returns the stored pinyin. NOTE(review): for a key that was never set the
 *   runtime value is `undefined` even though the declared return type is
 *   `string` (the `as string` cast and the array read both hide it).
 */
get(word: string): string {
|
||||
if (word.length > 1) {
|
||||
return this.StringDICT.get(word) as string;
|
||||
} else {
|
||||
// Single code unit: use its numeric code as the array index.
const code = word.charCodeAt(0);
|
||||
return this.NumberDICT[code];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Stores `pinyin` for `word`, overwriting any previous value.
 *
 * Keys longer than one UTF-16 code unit go into `StringDICT`; a key of a
 * single code unit goes into `NumberDICT` at index `charCodeAt(0)`.
 *
 * @param word - key to store under
 * @param pinyin - value to store
 */
set(word: string, pinyin: string) {
|
||||
if (word.length > 1) {
|
||||
this.StringDICT.set(word, pinyin);
|
||||
} else {
|
||||
// Single code unit: use its numeric code as the array index.
const code = word.charCodeAt(0);
|
||||
this.NumberDICT[code] = pinyin;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Removes every entry: `NumberDICT` is replaced by a new empty array and
 * `StringDICT` is emptied in place.
 */
clear() {
|
||||
this.NumberDICT = [];
|
||||
this.StringDICT.clear();
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
import { acTree } from '@/common/segmentit';
|
||||
import { Probability, Priority } from '@/common/constant';
|
||||
import { splitString, stringLength } from '@/common/utils';
|
||||
import { FastDictFactory, splitString, stringLength } from '@/common/utils';
|
||||
import DICT1 from '@/data/dict1';
|
||||
let customDict: { [key: string]: string } = {};
|
||||
let customMultipleDict: string[] = [];
|
||||
let customPolyphonicDict: string[] = [];
|
||||
const customMultipleDict = new FastDictFactory();
|
||||
const customPolyphonicDict = new FastDictFactory();
|
||||
|
||||
type CustomHandleType = 'add' | 'replace';
|
||||
|
||||
@@ -29,20 +29,20 @@ const CustomDictName = Symbol('custom');
|
||||
* @param {CustomPinyinOptions} options multiple/polyphonic 对于 customPinyin 补充词汇的处理
|
||||
*/
|
||||
export function customPinyin(
|
||||
config: { [key: string]: string } = {},
|
||||
config: { [word: string]: string } = {},
|
||||
options?: CustomPinyinOptions
|
||||
) {
|
||||
const keys = Object.keys(config).sort(
|
||||
(key1, key2) => stringLength(key2) - stringLength(key1)
|
||||
const words = Object.keys(config).sort(
|
||||
(word1, word2) => stringLength(word2) - stringLength(word1)
|
||||
);
|
||||
keys.forEach((key) => {
|
||||
customDict[key] = config[key];
|
||||
words.forEach((word) => {
|
||||
customDict[word] = config[word];
|
||||
});
|
||||
const customPatterns = Object.keys(customDict).map((key) => ({
|
||||
zh: key,
|
||||
pinyin: customDict[key],
|
||||
probability: Probability.Custom + stringLength(key),
|
||||
length: key.length,
|
||||
const customPatterns = Object.keys(customDict).map((word) => ({
|
||||
zh: word,
|
||||
pinyin: customDict[word],
|
||||
probability: Probability.Custom + stringLength(word),
|
||||
length: stringLength(word),
|
||||
priority: Priority.Custom,
|
||||
dict: CustomDictName,
|
||||
}));
|
||||
@@ -58,23 +58,21 @@ export function customPinyin(
|
||||
|
||||
function addCustomConfigToDict(
|
||||
config: { [key: string]: string },
|
||||
dict: string[],
|
||||
dict: FastDictFactory,
|
||||
handleType: CustomHandleType
|
||||
) {
|
||||
for (let key in config) {
|
||||
const pinyins = config[key];
|
||||
splitString(key).forEach((word, index) => {
|
||||
for (let word in config) {
|
||||
const pinyins = config[word];
|
||||
splitString(word).forEach((char, index) => {
|
||||
const pinyin = pinyins.split(' ')[index] || '';
|
||||
const wordCode = word.charCodeAt(0);
|
||||
if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
|
||||
if (handleType === 'replace' || (handleType === 'add' && !dict.get(char) && !DICT1.get(char))) {
|
||||
// 直接覆盖原词典
|
||||
dict[wordCode] = pinyin;
|
||||
dict.set(char, pinyin);
|
||||
} else {
|
||||
// 补充至原词典
|
||||
dict[wordCode] = dict[wordCode] || DICT1[wordCode];
|
||||
if (!dict[wordCode].split(' ').includes(pinyin)) {
|
||||
dict[wordCode] += ` ${pinyin}`;
|
||||
dict[wordCode] = dict[wordCode].trim();
|
||||
dict.set(char, dict.get(char) || DICT1.get(char));
|
||||
if (!dict.get(char).split(' ').includes(pinyin)) {
|
||||
dict.set(char, `${dict.get(char)} ${pinyin}`.trim());
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -91,15 +89,15 @@ export const getCustomPolyphonicDict = () => {
|
||||
|
||||
export function clearCustomDict(dict: CustomDictType | CustomDictType[]) {
|
||||
if (dict === 'pinyin' || dict.indexOf('pinyin') !== -1) {
|
||||
Object.keys(customDict).forEach(function (key) {
|
||||
delete customDict[key];
|
||||
Object.keys(customDict).forEach(function (word) {
|
||||
delete customDict[word];
|
||||
});
|
||||
acTree.removeDict(CustomDictName);
|
||||
}
|
||||
if (dict === 'multiple' || dict.indexOf('multiple') !== -1) {
|
||||
customMultipleDict.length = 0;
|
||||
customMultipleDict.clear();
|
||||
}
|
||||
if (dict === 'polyphonic' || dict.indexOf('polyphonic') !== -1) {
|
||||
customPolyphonicDict.length = 0;
|
||||
customPolyphonicDict.clear();
|
||||
}
|
||||
}
|
||||
@@ -26,36 +26,36 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
|
||||
const name = typeof options === "object" ? options.name : options;
|
||||
const dictName = name || DefaultName;
|
||||
const dict1Handle = (options as DictOptions)?.dict1 || "add";
|
||||
for (let key in dict as DICT) {
|
||||
const value = (dict as DICT)[key];
|
||||
for (let word in dict as DICT) {
|
||||
const value = (dict as DICT)[word];
|
||||
const pinyin = Array.isArray(value) ? value[0] : value;
|
||||
if (stringLength(key) === 1) {
|
||||
if (stringLength(word) === 1) {
|
||||
addToOriginDict(
|
||||
dictName,
|
||||
key,
|
||||
word,
|
||||
pinyin,
|
||||
dict1Handle
|
||||
);
|
||||
}
|
||||
if (Array.isArray(value)) {
|
||||
patterns.push({
|
||||
zh: key,
|
||||
zh: word,
|
||||
pinyin,
|
||||
probability:
|
||||
typeof value[1] === "number"
|
||||
? value[1]
|
||||
: Probability.DICT * key.length * key.length,
|
||||
length: key.length,
|
||||
: Probability.DICT * stringLength(word) * stringLength(word),
|
||||
length: stringLength(word),
|
||||
priority: Priority.Normal,
|
||||
dict: dictName,
|
||||
pos: typeof value[2] === "string" ? value[2] : "",
|
||||
});
|
||||
} else {
|
||||
patterns.push({
|
||||
zh: key,
|
||||
zh: word,
|
||||
pinyin,
|
||||
probability: Probability.DICT * key.length * key.length,
|
||||
length: key.length,
|
||||
probability: Probability.DICT * stringLength(word) * stringLength(word),
|
||||
length: stringLength(word),
|
||||
priority: Priority.Normal,
|
||||
dict: dictName,
|
||||
});
|
||||
@@ -71,7 +71,7 @@ export function removeDict(dictName?: string) {
|
||||
|
||||
function addToOriginDict(
|
||||
dict: string | Symbol,
|
||||
key: string,
|
||||
char: string,
|
||||
pinyin: string,
|
||||
handle: "add" | "replace" | "ignore" = "add"
|
||||
) {
|
||||
@@ -79,26 +79,25 @@ function addToOriginDict(
|
||||
originDictMap.set(dict, {})
|
||||
}
|
||||
const originDict = originDictMap.get(dict)!;
|
||||
const code = key.charCodeAt(0);
|
||||
if (!originDict[key]) {
|
||||
originDict[key] = DICT1[code] as string;
|
||||
if (!originDict[char]) {
|
||||
originDict[char] = DICT1.get(char) as string;
|
||||
}
|
||||
if (handle === "add") {
|
||||
if (DICT1[code] && !DICT1[code].split(' ').includes(pinyin)) {
|
||||
DICT1[code] += ` ${pinyin}`;
|
||||
} else if (!DICT1[code]) {
|
||||
DICT1[code] = pinyin;
|
||||
const existedPinyin = DICT1.get(char);
|
||||
if (existedPinyin && !existedPinyin.split(' ').includes(pinyin)) {
|
||||
DICT1.set(char, `${existedPinyin} ${pinyin}`);
|
||||
} else if (!DICT1.get(char)) {
|
||||
DICT1.set(char, pinyin);
|
||||
}
|
||||
} else if (handle === "replace") {
|
||||
DICT1[code] = pinyin;
|
||||
DICT1.set(char, pinyin);
|
||||
}
|
||||
}
|
||||
|
||||
function removeOriginDict(dict: string | Symbol) {
|
||||
const originDict = originDictMap.get(dict) || {};
|
||||
for (let key in originDict) {
|
||||
const code = key.charCodeAt(0);
|
||||
DICT1[code] = originDict[key];
|
||||
delete originDict[key];
|
||||
for (let char in originDict) {
|
||||
DICT1.set(char, originDict[char]);
|
||||
delete originDict[char];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,22 +13,20 @@ import { SingleWordResult } from "../../common/type";
|
||||
import type { SurnameMode } from "../../common/type";
|
||||
import { acTree, TokenizationAlgorithm } from "../../common/segmentit";
|
||||
import {
|
||||
DoubleUnicodePrefixReg,
|
||||
DoubleUnicodeSuffixReg,
|
||||
Priority,
|
||||
} from "@/common/constant";
|
||||
import { splitString } from "@/common/utils";
|
||||
|
||||
/**
|
||||
* @description: 获取单个字符的拼音
|
||||
* @param {string} word
|
||||
* @param {string} char
|
||||
* @return {string}
|
||||
*/
|
||||
type GetSingleWordPinyin = (word: string) => string;
|
||||
export const getSingleWordPinyin: GetSingleWordPinyin = (word) => {
|
||||
const wordCode = word.charCodeAt(0);
|
||||
const pinyin = DICT1[wordCode];
|
||||
type GetSingleWordPinyin = (char: string) => string;
|
||||
export const getSingleWordPinyin: GetSingleWordPinyin = (char) => {
|
||||
const pinyin = DICT1.get(char);
|
||||
// 若查到, 则返回第一个拼音; 若未查到, 返回原字符
|
||||
return pinyin ? pinyin.split(" ")[0] : word;
|
||||
return pinyin ? pinyin.split(" ")[0] : char;
|
||||
};
|
||||
|
||||
export const getPinyin = (
|
||||
@@ -39,13 +37,14 @@ export const getPinyin = (
|
||||
): SingleWordResult[] => {
|
||||
const matches = acTree.search(word, surname, segmentit);
|
||||
let matchIndex = 0;
|
||||
for (let i = 0; i < word.length; ) {
|
||||
const zhChars = splitString(word);
|
||||
for (let i = 0; i < zhChars.length; ) {
|
||||
const match = matches[matchIndex];
|
||||
if (match && i === match.index) {
|
||||
if (match.length === 1 && match.priority <= Priority.Normal) {
|
||||
const char = word[i];
|
||||
const char = zhChars[i];
|
||||
let pinyin: string = "";
|
||||
pinyin = processSepecialPinyin(char, word[i - 1], word[i + 1]);
|
||||
pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]);
|
||||
list[i] = {
|
||||
origin: char,
|
||||
result: pinyin,
|
||||
@@ -59,32 +58,21 @@ export const getPinyin = (
|
||||
const pinyins = match.pinyin.split(" ");
|
||||
let pinyinIndex = 0;
|
||||
for (let j = 0; j < match.length; j++) {
|
||||
if (
|
||||
DoubleUnicodePrefixReg.test(match.zh[j - 1]) &&
|
||||
DoubleUnicodeSuffixReg.test(match.zh[j])
|
||||
) {
|
||||
list[i + j] = {
|
||||
origin: match.zh[j],
|
||||
result: "",
|
||||
isZh: true,
|
||||
originPinyin: "",
|
||||
};
|
||||
} else {
|
||||
list[i + j] = {
|
||||
origin: match.zh[j],
|
||||
result: pinyins[pinyinIndex],
|
||||
isZh: true,
|
||||
originPinyin: pinyins[pinyinIndex],
|
||||
};
|
||||
pinyinIndex++;
|
||||
}
|
||||
const zhChars = splitString(match.zh);
|
||||
list[i + j] = {
|
||||
origin: zhChars[j],
|
||||
result: pinyins[pinyinIndex],
|
||||
isZh: true,
|
||||
originPinyin: pinyins[pinyinIndex],
|
||||
};
|
||||
pinyinIndex++;
|
||||
}
|
||||
i += match.length;
|
||||
matchIndex++;
|
||||
} else {
|
||||
const char = word[i];
|
||||
const char = zhChars[i];
|
||||
let pinyin: string = "";
|
||||
pinyin = processSepecialPinyin(char, word[i - 1], word[i + 1]);
|
||||
pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]);
|
||||
list[i] = {
|
||||
origin: char,
|
||||
result: pinyin,
|
||||
@@ -117,18 +105,17 @@ const getPinyinWithoutTone: GetPinyinWithoutTone = (pinyin) => {
|
||||
|
||||
/**
|
||||
* @description: 获取单字符的多音拼音
|
||||
* @param {string} word
|
||||
* @param {string} char
|
||||
* @return {WordResult[]}
|
||||
*/
|
||||
type GetAllPinyin = (word: string, surname?: SurnameMode) => string[];
|
||||
export const getAllPinyin: GetAllPinyin = (word, surname = "off") => {
|
||||
const wordCode = word.charCodeAt(0);
|
||||
type GetAllPinyin = (char: string, surname?: SurnameMode) => string[];
|
||||
export const getAllPinyin: GetAllPinyin = (char, surname = "off") => {
|
||||
const customMultpileDict = getCustomMultpileDict();
|
||||
let pinyin = DICT1[wordCode] ? DICT1[wordCode].split(" ") : [];
|
||||
if (customMultpileDict[wordCode]) {
|
||||
pinyin = customMultpileDict[wordCode].split(" ");
|
||||
let pinyin = DICT1.get(char) ? DICT1.get(char).split(" ") : [];
|
||||
if (customMultpileDict.get(char)) {
|
||||
pinyin = customMultpileDict.get(char).split(" ");
|
||||
} else if (surname !== "off") {
|
||||
const surnamePinyin = Surnames[word];
|
||||
const surnamePinyin = Surnames[char];
|
||||
if (surnamePinyin) {
|
||||
pinyin = [surnamePinyin].concat(
|
||||
pinyin.filter((py) => py !== surnamePinyin)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { stringLength } from "@/common/utils";
|
||||
import { TokenizationAlgorithm } from "../../common/segmentit";
|
||||
import type {
|
||||
SingleWordResult,
|
||||
@@ -13,7 +14,6 @@ import {
|
||||
middlewareToneType,
|
||||
middlewareV,
|
||||
middlewareType,
|
||||
middlewareDoubleUnicode,
|
||||
middlewareToneSandhi,
|
||||
} from "./middlewares";
|
||||
|
||||
@@ -241,7 +241,7 @@ function pinyin(
|
||||
options.nonZh = "removed";
|
||||
}
|
||||
|
||||
let list: SingleWordResult[] = Array(word.length);
|
||||
let list: SingleWordResult[] = Array(stringLength(word));
|
||||
|
||||
list = getPinyin(
|
||||
word,
|
||||
@@ -253,9 +253,6 @@ function pinyin(
|
||||
// 一和不变调处理
|
||||
list = middlewareToneSandhi(list, options.toneSandhi as boolean);
|
||||
|
||||
// 双 unicode 编码字符处理
|
||||
list = middlewareDoubleUnicode(list);
|
||||
|
||||
// nonZh 参数及 removeNonZh 参数
|
||||
list = middleWareNonZh(list, options);
|
||||
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
import { stringLength, isZhChar } from "@/common/utils";
|
||||
import { stringLength } from "@/common/utils";
|
||||
import type { SingleWordResult } from "../../common/type";
|
||||
import {
|
||||
DoubleUnicodePrefixReg,
|
||||
DoubleUnicodeSuffixReg,
|
||||
} from "@/common/constant";
|
||||
import { getAllPinyin, getMultiplePinyin } from "./handle";
|
||||
import { CompleteOptions } from "./index";
|
||||
import {
|
||||
@@ -14,6 +10,7 @@ import {
|
||||
getPinyinWithoutTone,
|
||||
getPinyinWithNum,
|
||||
} from "./handle";
|
||||
import DICT1 from "@/data/dict1";
|
||||
|
||||
// 验证输入是否为字符串
|
||||
export const validateType = (word: unknown) => {
|
||||
@@ -202,37 +199,13 @@ export const middlewareType = (
|
||||
num: Number(getNumOfTone(item.originPinyin)),
|
||||
isZh: item.isZh,
|
||||
polyphonic,
|
||||
inZhRange: isZhChar(item.origin),
|
||||
inZhRange: !!DICT1.get(item.origin),
|
||||
};
|
||||
});
|
||||
}
|
||||
return list.map((item) => item.result).join(options.separator);
|
||||
};
|
||||
|
||||
/**
 * Handles characters encoded as two UTF-16 code units: merges each
 * prefix/suffix pair of adjacent entries into the first one and removes the
 * second.
 *
 * Entries of the input list are mutated in place (`origin`, `result`,
 * `originPinyin` on the first entry of a pair, `delete` on the second).
 *
 * @param list - per-code-unit results to post-process
 * @returns a new array without the entries flagged with `delete`
 */
|
||||
export const middlewareDoubleUnicode = (
|
||||
list: SingleWordResult[]
|
||||
): SingleWordResult[] => {
|
||||
// Walk backwards from the second-to-last entry, comparing each entry with its successor.
for (let i = list.length - 2; i >= 0; i--) {
|
||||
const cur = list[i];
|
||||
const next = list[i + 1];
|
||||
if (
|
||||
DoubleUnicodePrefixReg.test(cur.origin) &&
|
||||
DoubleUnicodeSuffixReg.test(next.origin)
|
||||
) {
|
||||
// Fold the second half into the first and flag the second half for removal.
cur.origin += next.origin;
|
||||
cur.result += next.result;
|
||||
cur.originPinyin = cur.result;
|
||||
next.delete = true;
|
||||
// Extra decrement on top of the loop's own step: skip one further entry after a merge.
i--;
|
||||
}
|
||||
}
|
||||
// Drop the entries flagged above.
list = list.filter((item) => {
|
||||
return !item.delete;
|
||||
});
|
||||
return list;
|
||||
};
|
||||
|
||||
// 是否开启变调
|
||||
export const middlewareToneSandhi = (
|
||||
list: SingleWordResult[],
|
||||
|
||||
@@ -5,7 +5,6 @@ import {
|
||||
middlewarePattern,
|
||||
middlewareToneType,
|
||||
middlewareV,
|
||||
middlewareDoubleUnicode,
|
||||
} from '@/core/pinyin/middlewares';
|
||||
import DICT1 from '@/data/dict1';
|
||||
import {
|
||||
@@ -15,7 +14,7 @@ import {
|
||||
getFinalParts,
|
||||
} from '@/core/pinyin/handle';
|
||||
import { getCustomPolyphonicDict } from '../custom';
|
||||
import { isZhChar, splitString } from '@/common/utils';
|
||||
import { splitString } from '@/common/utils';
|
||||
|
||||
interface BasicOptions {
|
||||
/**
|
||||
@@ -177,8 +176,6 @@ function polyphonic(
|
||||
|
||||
let list = getPolyphonicList(text);
|
||||
|
||||
list = middlewareDoubleUnicode(list);
|
||||
|
||||
// nonZh 参数及 removeNonZh 参数
|
||||
list = middleWareNonZh(list, options);
|
||||
|
||||
@@ -206,14 +203,13 @@ function polyphonic(
|
||||
|
||||
// 获取每个字多音字的数组
|
||||
const getPolyphonicList = (text: string): SingleWordResult[] => {
|
||||
return splitString(text).map((word) => {
|
||||
const wordCode = word.charCodeAt(0);
|
||||
return splitString(text).map((char) => {
|
||||
const customPolyphonicDict = getCustomPolyphonicDict();
|
||||
const pinyin = customPolyphonicDict[wordCode] || DICT1[wordCode] || word;
|
||||
const pinyin = customPolyphonicDict.get(char) || DICT1.get(char) || char;
|
||||
return {
|
||||
origin: word,
|
||||
origin: char,
|
||||
result: pinyin,
|
||||
isZh: pinyin !== word,
|
||||
isZh: pinyin !== char,
|
||||
originPinyin: pinyin,
|
||||
};
|
||||
});
|
||||
@@ -259,7 +255,7 @@ export const handleType = (
|
||||
finalTail: tail,
|
||||
num: Number(getNumOfTone(item.originPinyin)),
|
||||
isZh: item.isZh,
|
||||
inZhRange: isZhChar(item.origin),
|
||||
inZhRange: !!DICT1.get(item.origin),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
22687
lib/data/dict1.ts
22687
lib/data/dict1.ts
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@ import {
|
||||
getNumOfTone,
|
||||
getPinyinWithoutTone,
|
||||
} from '@/core/pinyin/handle';
|
||||
import { isZhChar } from '@/common/utils';
|
||||
import DICT1 from './dict1';
|
||||
|
||||
export const InitialList = [
|
||||
'zh',
|
||||
@@ -225,7 +225,7 @@ export function processToneSandhi(cur: string, pre: string, next: string) {
|
||||
|
||||
// 处理「了」字的变调
|
||||
export function processToneSandhiLiao(cur: string, pre: string) {
|
||||
if (cur === '了' && !isZhChar(pre)) {
|
||||
if (cur === '了' && (!pre || !DICT1.get(pre))) {
|
||||
return 'liǎo';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,4 +60,14 @@ describe("addDict", () => {
|
||||
expect(result).to.be.equal("yī");
|
||||
removeDict();
|
||||
});
|
||||
|
||||
it("[addDict]2 unicode dict", () => {
|
||||
const stringDict = {
|
||||
𧒽: 'lei'
|
||||
}
|
||||
addDict(stringDict, { name: 'double-unicode-dict' });
|
||||
const result = pinyin("𧒽");
|
||||
expect(result).to.be.equal("lei");
|
||||
removeDict();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -28,4 +28,10 @@ describe('double unicode', () => {
|
||||
});
|
||||
expect(result5).to.be.equal('cè shì a𧒽𧒽a cè shì a𧒽𧒽a cè shì');
|
||||
});
|
||||
|
||||
it('[double unicode]dp consecutive', () => {
|
||||
const result4 = pinyin('测试𬭬𬭬测试𬭬测试');
|
||||
expect(result4).to.be.equal('cè shì huì huì cè shì huì cè shì');
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
1
types/common/constant.d.ts
vendored
1
types/common/constant.d.ts
vendored
@@ -1,6 +1,7 @@
|
||||
export declare const DoubleUnicodePrefixReg: RegExp;
|
||||
export declare const DoubleUnicodeSuffixReg: RegExp;
|
||||
export declare const DoubleUnicodeReg: RegExp;
|
||||
export declare const DoubleUnicodeCharReg: RegExp;
|
||||
export declare const enum Probability {
|
||||
Unknown = 1e-13,
|
||||
Rule = 1e-12,
|
||||
|
||||
9
types/common/utils.d.ts
vendored
9
types/common/utils.d.ts
vendored
@@ -1,3 +1,10 @@
|
||||
export declare function stringLength(text: string): number;
|
||||
export declare function splitString(text: string): string[];
|
||||
export declare function isZhChar(char: string): boolean;
|
||||
/**
 * Dictionary from a word/character to its pinyin string, backed by two stores:
 * an array indexed by character code for keys of a single UTF-16 code unit,
 * and a `Map` for every longer key.
 */
export declare class FastDictFactory {
|
||||
/** Entries whose key is a single UTF-16 code unit, indexed by `charCodeAt(0)`. */
NumberDICT: string[];
|
||||
/** Entries whose key is longer than one UTF-16 code unit. */
StringDICT: Map<string, string>;
|
||||
/** Creates an empty dictionary. */
constructor();
|
||||
/** Returns the pinyin stored for `word`. */
get(word: string): string;
|
||||
/** Stores `pinyin` for `word`, overwriting any previous value. */
set(word: string, pinyin: string): void;
|
||||
/** Removes every entry from both stores. */
clear(): void;
|
||||
}
|
||||
|
||||
7
types/core/custom/index.d.ts
vendored
7
types/core/custom/index.d.ts
vendored
@@ -1,3 +1,4 @@
|
||||
import { FastDictFactory } from '@/common/utils';
|
||||
type CustomHandleType = 'add' | 'replace';
|
||||
type CustomDictType = 'pinyin' | 'multiple' | 'polyphonic';
|
||||
interface CustomPinyinOptions {
|
||||
@@ -16,9 +17,9 @@ interface CustomPinyinOptions {
|
||||
* @param {CustomPinyinOptions} options multiple/polyphonic 对于 customPinyin 补充词汇的处理
|
||||
*/
|
||||
export declare function customPinyin(config?: {
|
||||
[key: string]: string;
|
||||
[word: string]: string;
|
||||
}, options?: CustomPinyinOptions): void;
|
||||
export declare const getCustomMultpileDict: () => string[];
|
||||
export declare const getCustomPolyphonicDict: () => string[];
|
||||
export declare const getCustomMultpileDict: () => FastDictFactory;
|
||||
export declare const getCustomPolyphonicDict: () => FastDictFactory;
|
||||
export declare function clearCustomDict(dict: CustomDictType | CustomDictType[]): void;
|
||||
export {};
|
||||
|
||||
8
types/core/pinyin/handle.d.ts
vendored
8
types/core/pinyin/handle.d.ts
vendored
@@ -3,10 +3,10 @@ import type { SurnameMode } from "../../common/type";
|
||||
import { TokenizationAlgorithm } from "../../common/segmentit";
|
||||
/**
|
||||
* @description: 获取单个字符的拼音
|
||||
* @param {string} word
|
||||
* @param {string} char
|
||||
* @return {string}
|
||||
*/
|
||||
type GetSingleWordPinyin = (word: string) => string;
|
||||
type GetSingleWordPinyin = (char: string) => string;
|
||||
export declare const getSingleWordPinyin: GetSingleWordPinyin;
|
||||
export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm) => SingleWordResult[];
|
||||
/**
|
||||
@@ -18,10 +18,10 @@ type GetPinyinWithoutTone = (pinyin: string) => string;
|
||||
declare const getPinyinWithoutTone: GetPinyinWithoutTone;
|
||||
/**
|
||||
* @description: 获取单字符的多音拼音
|
||||
* @param {string} word
|
||||
* @param {string} char
|
||||
* @return {WordResult[]}
|
||||
*/
|
||||
type GetAllPinyin = (word: string, surname?: SurnameMode) => string[];
|
||||
type GetAllPinyin = (char: string, surname?: SurnameMode) => string[];
|
||||
export declare const getAllPinyin: GetAllPinyin;
|
||||
/**
|
||||
* @description: 获取单字符的多音拼音
|
||||
|
||||
1
types/core/pinyin/middlewares.d.ts
vendored
1
types/core/pinyin/middlewares.d.ts
vendored
@@ -20,5 +20,4 @@ export declare const middlewareType: (list: SingleWordResult[], options: Complet
|
||||
polyphonic: string[];
|
||||
inZhRange: boolean;
|
||||
}[];
|
||||
export declare const middlewareDoubleUnicode: (list: SingleWordResult[]) => SingleWordResult[];
|
||||
export declare const middlewareToneSandhi: (list: SingleWordResult[], toneSandhi: boolean) => SingleWordResult[];
|
||||
|
||||
3
types/data/dict1.d.ts
vendored
3
types/data/dict1.d.ts
vendored
@@ -1,2 +1,3 @@
|
||||
declare const DICT1: string[];
|
||||
import { FastDictFactory } from "@/common/utils";
|
||||
declare const DICT1: FastDictFactory;
|
||||
export default DICT1;
|
||||
|
||||
Reference in New Issue
Block a user