mirror of
https://github.com/zh-lx/pinyin-pro.git
synced 2026-03-13 09:51:38 +08:00
Merge pull request #242 from zh-lx/feature-2unicode-match-and-custom
feat: custom 和 match api 对于双 unicode 编码字符的适配
This commit is contained in:
@@ -1,7 +1,24 @@
|
||||
import { DoubleUnicodeReg } from './constant';
|
||||
import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';
|
||||
|
||||
export function getStringLength(string: string) {
|
||||
return string.replace(DoubleUnicodeReg, '_').length;
|
||||
/**
 * Returns the length of `text` after every match of `DoubleUnicodeReg` has
 * been replaced by a single `_` placeholder.
 *
 * NOTE(review): presumably `DoubleUnicodeReg` is a global pattern matching
 * surrogate pairs, so each such character counts as 1 — confirm in ./constant.
 *
 * @param text The string to measure.
 * @returns The length of the string once matches are collapsed to one unit.
 */
export function stringLength(text: string) {
  const collapsed = text.replace(DoubleUnicodeReg, '_');
  return collapsed.length;
}
|
||||
|
||||
/**
 * Splits `text` into an array of characters, keeping two-code-unit characters
 * together as a single two-unit element.
 *
 * A code unit is merged with the following one only when it matches
 * `DoubleUnicodePrefixReg` and the next unit matches `DoubleUnicodeSuffixReg`;
 * every other code unit becomes its own element.
 *
 * NOTE(review): the two patterns are presumably the high/low surrogate ranges;
 * they are defined in ./constant — confirm there.
 *
 * @param text The string to split.
 * @returns The characters of `text` in order; an empty array for an empty string.
 */
export function splitString(text: string): string[] {
  const pieces: string[] = [];
  for (let pos = 0; pos < text.length; ) {
    const unit = text.charAt(pos);
    // Past the end, charAt() yields '', so a trailing lone prefix unit is
    // tested against the empty string.
    const isPair =
      DoubleUnicodePrefixReg.test(unit) &&
      DoubleUnicodeSuffixReg.test(text.charAt(pos + 1));
    if (isPair) {
      pieces.push(text.substring(pos, pos + 2));
      pos += 2;
      continue;
    }
    pieces.push(unit);
    pos += 1;
  }
  return pieces;
}
|
||||
|
||||
export function isZhChar(char: string) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { acTree } from '@/common/segmentit';
|
||||
import { Probability, Priority } from '@/common/constant';
|
||||
import { getStringLength } from '@/common/utils';
|
||||
import { splitString, stringLength } from '@/common/utils';
|
||||
import DICT1 from '@/data/dict1';
|
||||
let customDict: { [key: string]: string } = {};
|
||||
let customMultipleDict: string[] = [];
|
||||
@@ -33,7 +33,7 @@ export function customPinyin(
|
||||
options?: CustomPinyinOptions
|
||||
) {
|
||||
const keys = Object.keys(config).sort(
|
||||
(key1, key2) => getStringLength(key2) - getStringLength(key1)
|
||||
(key1, key2) => stringLength(key2) - stringLength(key1)
|
||||
);
|
||||
keys.forEach((key) => {
|
||||
customDict[key] = config[key];
|
||||
@@ -41,7 +41,7 @@ export function customPinyin(
|
||||
const customPatterns = Object.keys(customDict).map((key) => ({
|
||||
zh: key,
|
||||
pinyin: customDict[key],
|
||||
probability: Probability.Custom + getStringLength(key),
|
||||
probability: Probability.Custom + stringLength(key),
|
||||
length: key.length,
|
||||
priority: Priority.Custom,
|
||||
dict: CustomDictName,
|
||||
@@ -63,10 +63,10 @@ function addCustomConfigToDict(
|
||||
) {
|
||||
for (let key in config) {
|
||||
const pinyins = config[key];
|
||||
key.split('').forEach((word, index) => {
|
||||
splitString(key).forEach((word, index) => {
|
||||
const pinyin = pinyins.split(' ')[index] || '';
|
||||
const wordCode = word.charCodeAt(0);
|
||||
if (handleType === 'replace') {
|
||||
if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
|
||||
// 直接覆盖原词典
|
||||
dict[wordCode] = pinyin;
|
||||
} else {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { Priority, Probability } from "@/common/constant";
|
||||
import { Pattern, acTree } from "@/common/segmentit";
|
||||
import { getStringLength } from "@/common/utils";
|
||||
import { stringLength } from "@/common/utils";
|
||||
import DICT1 from "@/data/dict1";
|
||||
|
||||
const DefaultName = Symbol("default");
|
||||
@@ -29,7 +29,7 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
|
||||
for (let key in dict as DICT) {
|
||||
const value = (dict as DICT)[key];
|
||||
const pinyin = Array.isArray(value) ? value[0] : value;
|
||||
if (getStringLength(key) === 1) {
|
||||
if (stringLength(key) === 1) {
|
||||
addToOriginDict(
|
||||
dictName,
|
||||
key,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import { pinyin as _pinyin } from '@/core/pinyin';
|
||||
import { splitString } from "@/common/utils";
|
||||
import { pinyin as _pinyin } from "@/core/pinyin";
|
||||
|
||||
interface MatchOptions {
|
||||
/**
|
||||
* @description 每个汉字和拼音需要遵从的匹配精度
|
||||
*/
|
||||
precision?: 'first' | 'start' | 'every' | 'any';
|
||||
precision?: "first" | "start" | "every" | "any";
|
||||
/**
|
||||
* @description 匹配的汉字下标是否为连续的才算匹配成功
|
||||
*/
|
||||
@@ -12,11 +13,11 @@ interface MatchOptions {
|
||||
/**
|
||||
* @description 匹配时对于空格的处理
|
||||
*/
|
||||
space?: 'ignore' | 'preserve';
|
||||
space?: "ignore" | "preserve";
|
||||
/**
|
||||
* @description 最后一个字的匹配精度
|
||||
*/
|
||||
lastPrecision?: 'first' | 'start' | 'every' | 'any';
|
||||
lastPrecision?: "first" | "start" | "every" | "any";
|
||||
/**
|
||||
* @description 是否大小写不敏感
|
||||
*/
|
||||
@@ -24,10 +25,10 @@ interface MatchOptions {
|
||||
}
|
||||
|
||||
const DefaultMatchOptions: MatchOptions = {
|
||||
precision: 'first',
|
||||
precision: "first",
|
||||
continuous: false,
|
||||
space: 'ignore',
|
||||
lastPrecision: 'start',
|
||||
space: "ignore",
|
||||
lastPrecision: "start",
|
||||
insensitive: true,
|
||||
};
|
||||
|
||||
@@ -41,8 +42,8 @@ const MAX_PINYIN_LENGTH = 6;
|
||||
* @return {Array | null} 若匹配成功,返回 text 中匹配成功的下标数组;若匹配失败,返回 null
|
||||
*/
|
||||
export const match = (text: string, pinyin: string, options?: MatchOptions) => {
|
||||
if (options?.precision === 'any') {
|
||||
options.lastPrecision = 'any';
|
||||
if (options?.precision === "any") {
|
||||
options.lastPrecision = "any";
|
||||
}
|
||||
const completeOptions = {
|
||||
...DefaultMatchOptions,
|
||||
@@ -54,14 +55,14 @@ export const match = (text: string, pinyin: string, options?: MatchOptions) => {
|
||||
pinyin = pinyin.toLowerCase();
|
||||
}
|
||||
// 移除空格
|
||||
if (completeOptions.space === 'ignore') {
|
||||
pinyin = pinyin.replace(/\s/g, '');
|
||||
if (completeOptions.space === "ignore") {
|
||||
pinyin = pinyin.replace(/\s/g, "");
|
||||
}
|
||||
const result =
|
||||
options?.precision === 'any'
|
||||
options?.precision === "any"
|
||||
? matchAny(text, pinyin, completeOptions)
|
||||
: matchAboveStart(text, pinyin, completeOptions);
|
||||
return result;
|
||||
return processDoubleUnicodeIndex(text, result);
|
||||
};
|
||||
|
||||
// 检测两个拼音最大的匹配长度
|
||||
@@ -81,23 +82,24 @@ const matchAny = (
|
||||
options: Required<MatchOptions>
|
||||
) => {
|
||||
let result = [];
|
||||
for (let i = 0; i < text.length; i++) {
|
||||
const words = splitString(text);
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
// 空格字符
|
||||
if (options.space === 'ignore' && text[i] === ' ') {
|
||||
if (options.space === "ignore" && words[i] === " ") {
|
||||
result.push(i);
|
||||
continue;
|
||||
}
|
||||
// 是否为中文匹配
|
||||
if (text[i] === pinyin[0]) {
|
||||
if (words[i] === pinyin[0]) {
|
||||
pinyin = pinyin.slice(1);
|
||||
result.push(i);
|
||||
continue;
|
||||
}
|
||||
// 当前字的多音字拼音
|
||||
const ps = _pinyin(text[i], {
|
||||
toneType: 'none',
|
||||
const ps = _pinyin(words[i], {
|
||||
toneType: "none",
|
||||
multiple: true,
|
||||
type: 'array',
|
||||
type: "array",
|
||||
});
|
||||
let currentLength = 0;
|
||||
ps.forEach((p) => {
|
||||
@@ -128,8 +130,8 @@ const matchAny = (
|
||||
return null;
|
||||
}
|
||||
}
|
||||
if (options.space === 'ignore') {
|
||||
result = result.filter((i) => text[i] !== ' ');
|
||||
if (options.space === "ignore") {
|
||||
result = result.filter((i) => words[i] !== " ");
|
||||
}
|
||||
return result.length ? result : null;
|
||||
};
|
||||
@@ -139,7 +141,7 @@ const matchAboveStart = (
|
||||
pinyin: string,
|
||||
options: Required<MatchOptions>
|
||||
) => {
|
||||
const words = text.split('');
|
||||
const words = splitString(text);
|
||||
|
||||
// 二维数组 dp[i][j],i 表示遍历到的 text 索引+1, j 表示遍历到的 pinyin 的索引+1
|
||||
const dp = Array(words.length + 1);
|
||||
@@ -157,7 +159,7 @@ const matchAboveStart = (
|
||||
// options.continuous 为 false 或 options.space 为 ignore 且当前为空格时,第 i 个字可以不参与匹配
|
||||
if (
|
||||
!options.continuous ||
|
||||
(options.space == 'ignore' && text[i - 1] === ' ')
|
||||
(options.space == "ignore" && words[i - 1] === " ")
|
||||
) {
|
||||
for (let j = 1; j <= pinyin.length; j++) {
|
||||
dp[i][j - 1] = dp[i - 1][j - 1];
|
||||
@@ -172,14 +174,14 @@ const matchAboveStart = (
|
||||
// 非开头且前面的字符未匹配完成,停止向后匹配
|
||||
continue;
|
||||
} else {
|
||||
const muls = _pinyin(text[i - 1], {
|
||||
type: 'array',
|
||||
toneType: 'none',
|
||||
const muls = _pinyin(words[i - 1], {
|
||||
type: "array",
|
||||
toneType: "none",
|
||||
multiple: true,
|
||||
});
|
||||
|
||||
// 非中文匹配
|
||||
if (text[i - 1] === pinyin[j - 1]) {
|
||||
if (words[i - 1] === pinyin[j - 1]) {
|
||||
const matches = [...dp[i - 1][j - 1], i - 1];
|
||||
// 记录最长的可匹配下标数组
|
||||
if (!dp[i][j] || matches.length > dp[i][j].length) {
|
||||
@@ -195,16 +197,16 @@ const matchAboveStart = (
|
||||
if (pinyin.length - j <= MAX_PINYIN_LENGTH) {
|
||||
// lastPrecision 参数处理
|
||||
const last = muls.some((py) => {
|
||||
if (options.lastPrecision === 'any') {
|
||||
if (options.lastPrecision === "any") {
|
||||
return py.includes(pinyin.slice(j - 1, pinyin.length));
|
||||
}
|
||||
if (options.lastPrecision === 'start') {
|
||||
if (options.lastPrecision === "start") {
|
||||
return py.startsWith(pinyin.slice(j - 1, pinyin.length));
|
||||
}
|
||||
if (options.lastPrecision === 'first') {
|
||||
if (options.lastPrecision === "first") {
|
||||
return py[0] === pinyin.slice(j - 1, pinyin.length);
|
||||
}
|
||||
if (options.lastPrecision === 'every') {
|
||||
if (options.lastPrecision === "every") {
|
||||
return py === pinyin.slice(j - 1, pinyin.length);
|
||||
}
|
||||
return false;
|
||||
@@ -217,7 +219,7 @@ const matchAboveStart = (
|
||||
const precision = options.precision;
|
||||
|
||||
// precision 为 start 时,匹配开头
|
||||
if (precision === 'start') {
|
||||
if (precision === "start") {
|
||||
muls.forEach((py) => {
|
||||
let end = j;
|
||||
const matches = [...dp[i - 1][j - 1], i - 1];
|
||||
@@ -234,7 +236,7 @@ const matchAboveStart = (
|
||||
}
|
||||
|
||||
// precision 为 first 时,匹配首字母
|
||||
if (precision === 'first') {
|
||||
if (precision === "first") {
|
||||
if (muls.some((py) => py[0] === pinyin[j - 1])) {
|
||||
const matches = [...dp[i - 1][j - 1], i - 1];
|
||||
// 记录最长的可匹配下标数组
|
||||
@@ -261,3 +263,33 @@ const matchAboveStart = (
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
/**
 * Converts indices into the `splitString(text)` word array into indices into
 * the raw `text` string.
 *
 * Every two-unit word located at or before a given index shifts the raw index
 * by one. A matched word that is itself two units long contributes both of
 * its raw positions to the output.
 *
 * The word array is walked once with a forward-only cursor, so `indexArray`
 * is expected to be in ascending order.
 *
 * @param text The original text that was matched against.
 * @param indexArray Word-level indices, or `null` when nothing matched.
 * @returns Raw string indices, or `null` when `indexArray` is `null`.
 */
function processDoubleUnicodeIndex(
  text: string,
  indexArray: number[] | null
): number[] | null {
  if (!indexArray) {
    return null;
  }

  const words = splitString(text);
  const expanded: number[] = [];
  let pairsSeen = 0; // two-unit words found in words[0..cursor-1]
  let cursor = 0; // next word that has not been inspected yet

  for (const wordIndex of indexArray) {
    // Advance the cursor through the current word, counting two-unit words.
    for (; cursor <= wordIndex; cursor++) {
      if (words[cursor].length === 2) {
        pairsSeen += 1;
      }
    }

    // Raw index of the last code unit belonging to the current word.
    const lastUnit = wordIndex + pairsSeen;
    if (words[wordIndex].length === 2) {
      expanded.push(lastUnit - 1, lastUnit);
    } else {
      expanded.push(lastUnit);
    }
  }

  return expanded;
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { getStringLength, isZhChar } from "@/common/utils";
|
||||
import { stringLength, isZhChar } from "@/common/utils";
|
||||
import type { SingleWordResult } from "../../common/type";
|
||||
import {
|
||||
DoubleUnicodePrefixReg,
|
||||
@@ -59,7 +59,7 @@ export const middlewareMultiple = (
|
||||
word: string,
|
||||
options: CompleteOptions
|
||||
): SingleWordResult[] | false => {
|
||||
if (getStringLength(word) === 1 && options.multiple) {
|
||||
if (stringLength(word) === 1 && options.multiple) {
|
||||
return getMultiplePinyin(word, options.surname);
|
||||
} else {
|
||||
return false;
|
||||
@@ -166,7 +166,7 @@ export const middlewareType = (
|
||||
options: CompleteOptions,
|
||||
word: string
|
||||
) => {
|
||||
if (options.multiple && getStringLength(word) === 1) {
|
||||
if (options.multiple && stringLength(word) === 1) {
|
||||
let last = "";
|
||||
list = list.filter((item) => {
|
||||
const res = item.result !== last;
|
||||
|
||||
@@ -15,7 +15,7 @@ import {
|
||||
getFinalParts,
|
||||
} from '@/core/pinyin/handle';
|
||||
import { getCustomPolyphonicDict } from '../custom';
|
||||
import { isZhChar } from '@/common/utils';
|
||||
import { isZhChar, splitString } from '@/common/utils';
|
||||
|
||||
interface BasicOptions {
|
||||
/**
|
||||
@@ -206,7 +206,7 @@ function polyphonic(
|
||||
|
||||
// 获取每个字多音字的数组
|
||||
const getPolyphonicList = (text: string): SingleWordResult[] => {
|
||||
return text.split('').map((word) => {
|
||||
return splitString(text).map((word) => {
|
||||
const wordCode = word.charCodeAt(0);
|
||||
const customPolyphonicDict = getCustomPolyphonicDict();
|
||||
const pinyin = customPolyphonicDict[wordCode] || DICT1[wordCode] || word;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { match } from '../lib/index';
|
||||
import { match, customPinyin, clearCustomDict } from '../lib/index';
|
||||
import { expect, describe, it } from 'vitest';
|
||||
|
||||
describe('match', () => {
|
||||
@@ -90,6 +90,17 @@ describe('match', () => {
|
||||
expect(result).to.deep.equal([2, 4]);
|
||||
});
|
||||
|
||||
it('[match]first&space', () => {
|
||||
customPinyin({
|
||||
𧒽: 'lei'
|
||||
}, {
|
||||
multiple: 'replace'
|
||||
})
|
||||
const result = match('𧒽测 试', 'l c s');
|
||||
expect(result).to.deep.equal([0, 1, 2, 4]);
|
||||
clearCustomDict(['pinyin', 'multiple', 'polyphonic']);
|
||||
});
|
||||
|
||||
it('[match]nonZh match', () => {
|
||||
const result = match('测uuuuuuuuuu试', 'cuuuuuu');
|
||||
expect(result).to.deep.equal([0, 1, 2, 3, 4, 5, 6]);
|
||||
|
||||
3
types/common/utils.d.ts
vendored
3
types/common/utils.d.ts
vendored
@@ -1,2 +1,3 @@
|
||||
export declare function getStringLength(string: string): number;
|
||||
export declare function stringLength(text: string): number;
|
||||
export declare function splitString(text: string): string[];
|
||||
export declare function isZhChar(char: string): boolean;
|
||||
|
||||
8
types/core/match/index.d.ts
vendored
8
types/core/match/index.d.ts
vendored
@@ -2,7 +2,7 @@ interface MatchOptions {
|
||||
/**
|
||||
* @description 每个汉字和拼音需要遵从的匹配精度
|
||||
*/
|
||||
precision?: 'first' | 'start' | 'every' | 'any';
|
||||
precision?: "first" | "start" | "every" | "any";
|
||||
/**
|
||||
* @description 匹配的汉字下标是否为连续的才算匹配成功
|
||||
*/
|
||||
@@ -10,11 +10,11 @@ interface MatchOptions {
|
||||
/**
|
||||
* @description 匹配时对于空格的处理
|
||||
*/
|
||||
space?: 'ignore' | 'preserve';
|
||||
space?: "ignore" | "preserve";
|
||||
/**
|
||||
* @description 最后一个字的匹配精度
|
||||
*/
|
||||
lastPrecision?: 'first' | 'start' | 'every' | 'any';
|
||||
lastPrecision?: "first" | "start" | "every" | "any";
|
||||
/**
|
||||
* @description 是否大小写不敏感
|
||||
*/
|
||||
@@ -27,5 +27,5 @@ interface MatchOptions {
|
||||
* @param {MatchOptions=} options 配置项
|
||||
* @return {Array | null} 若匹配成功,返回 text 中匹配成功的下标数组;若匹配失败,返回 null
|
||||
*/
|
||||
export declare const match: (text: string, pinyin: string, options?: MatchOptions) => any;
|
||||
export declare const match: (text: string, pinyin: string, options?: MatchOptions) => number[] | null;
|
||||
export {};
|
||||
|
||||
Reference in New Issue
Block a user