Merge pull request #242 from zh-lx/feature-2unicode-match-and-custom

feat: custom 和 match api 对于双 unicode 编码字符的适配
This commit is contained in:
zhoulixiang
2024-06-05 11:19:09 +08:00
committed by GitHub
9 changed files with 115 additions and 54 deletions

View File

@@ -1,7 +1,24 @@
import { DoubleUnicodeReg } from './constant';
import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';
export function getStringLength(string: string) {
return string.replace(DoubleUnicodeReg, '_').length;
export function stringLength(text: string) {
return text.replace(DoubleUnicodeReg, '_').length;
}
// Split a string into characters, keeping double-unicode characters intact:
// a code unit matching DoubleUnicodePrefixReg that is immediately followed by
// one matching DoubleUnicodeSuffixReg is returned as a single two-unit entry
// (presumably a high/low surrogate pair; see ./constant for the patterns).
export function splitString(text: string): string[] {
  const pieces: string[] = [];
  for (let pos = 0; pos < text.length; ) {
    const current = text.charAt(pos);
    // The suffix test only runs when the prefix test succeeded.
    const isPair =
      DoubleUnicodePrefixReg.test(current) &&
      DoubleUnicodeSuffixReg.test(text.charAt(pos + 1));
    pieces.push(isPair ? text.substring(pos, pos + 2) : current);
    // Skip both code units of a pair, otherwise advance by one.
    pos += isPair ? 2 : 1;
  }
  return pieces;
}
export function isZhChar(char: string) {

View File

@@ -1,6 +1,6 @@
import { acTree } from '@/common/segmentit';
import { Probability, Priority } from '@/common/constant';
import { getStringLength } from '@/common/utils';
import { splitString, stringLength } from '@/common/utils';
import DICT1 from '@/data/dict1';
let customDict: { [key: string]: string } = {};
let customMultipleDict: string[] = [];
@@ -33,7 +33,7 @@ export function customPinyin(
options?: CustomPinyinOptions
) {
const keys = Object.keys(config).sort(
(key1, key2) => getStringLength(key2) - getStringLength(key1)
(key1, key2) => stringLength(key2) - stringLength(key1)
);
keys.forEach((key) => {
customDict[key] = config[key];
@@ -41,7 +41,7 @@ export function customPinyin(
const customPatterns = Object.keys(customDict).map((key) => ({
zh: key,
pinyin: customDict[key],
probability: Probability.Custom + getStringLength(key),
probability: Probability.Custom + stringLength(key),
length: key.length,
priority: Priority.Custom,
dict: CustomDictName,
@@ -63,10 +63,10 @@ function addCustomConfigToDict(
) {
for (let key in config) {
const pinyins = config[key];
key.split('').forEach((word, index) => {
splitString(key).forEach((word, index) => {
const pinyin = pinyins.split(' ')[index] || '';
const wordCode = word.charCodeAt(0);
if (handleType === 'replace') {
if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
// 直接覆盖原词典
dict[wordCode] = pinyin;
} else {

View File

@@ -1,6 +1,6 @@
import { Priority, Probability } from "@/common/constant";
import { Pattern, acTree } from "@/common/segmentit";
import { getStringLength } from "@/common/utils";
import { stringLength } from "@/common/utils";
import DICT1 from "@/data/dict1";
const DefaultName = Symbol("default");
@@ -29,7 +29,7 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
for (let key in dict as DICT) {
const value = (dict as DICT)[key];
const pinyin = Array.isArray(value) ? value[0] : value;
if (getStringLength(key) === 1) {
if (stringLength(key) === 1) {
addToOriginDict(
dictName,
key,

View File

@@ -1,10 +1,11 @@
import { pinyin as _pinyin } from '@/core/pinyin';
import { splitString } from "@/common/utils";
import { pinyin as _pinyin } from "@/core/pinyin";
interface MatchOptions {
/**
* @description 每个汉字和拼音需要遵从的匹配精度
*/
precision?: 'first' | 'start' | 'every' | 'any';
precision?: "first" | "start" | "every" | "any";
/**
* @description 匹配的汉字下标是否为连续的才算匹配成功
*/
@@ -12,11 +13,11 @@ interface MatchOptions {
/**
* @description 匹配时对于空格的处理
*/
space?: 'ignore' | 'preserve';
space?: "ignore" | "preserve";
/**
* @description 最后一个字的匹配精度
*/
lastPrecision?: 'first' | 'start' | 'every' | 'any';
lastPrecision?: "first" | "start" | "every" | "any";
/**
* @description 是否大小写不敏感
*/
@@ -24,10 +25,10 @@ interface MatchOptions {
}
const DefaultMatchOptions: MatchOptions = {
precision: 'first',
precision: "first",
continuous: false,
space: 'ignore',
lastPrecision: 'start',
space: "ignore",
lastPrecision: "start",
insensitive: true,
};
@@ -41,8 +42,8 @@ const MAX_PINYIN_LENGTH = 6;
* @return {Array | null} 若匹配成功,返回 text 中匹配成功的下标数组;若匹配失败,返回 null
*/
export const match = (text: string, pinyin: string, options?: MatchOptions) => {
if (options?.precision === 'any') {
options.lastPrecision = 'any';
if (options?.precision === "any") {
options.lastPrecision = "any";
}
const completeOptions = {
...DefaultMatchOptions,
@@ -54,14 +55,14 @@ export const match = (text: string, pinyin: string, options?: MatchOptions) => {
pinyin = pinyin.toLowerCase();
}
// 移除空格
if (completeOptions.space === 'ignore') {
pinyin = pinyin.replace(/\s/g, '');
if (completeOptions.space === "ignore") {
pinyin = pinyin.replace(/\s/g, "");
}
const result =
options?.precision === 'any'
options?.precision === "any"
? matchAny(text, pinyin, completeOptions)
: matchAboveStart(text, pinyin, completeOptions);
return result;
return processDoubleUnicodeIndex(text, result);
};
// 检测两个拼音最大的匹配长度
@@ -81,23 +82,24 @@ const matchAny = (
options: Required<MatchOptions>
) => {
let result = [];
for (let i = 0; i < text.length; i++) {
const words = splitString(text);
for (let i = 0; i < words.length; i++) {
// 空格字符
if (options.space === 'ignore' && text[i] === ' ') {
if (options.space === "ignore" && words[i] === " ") {
result.push(i);
continue;
}
// 是否为中文匹配
if (text[i] === pinyin[0]) {
if (words[i] === pinyin[0]) {
pinyin = pinyin.slice(1);
result.push(i);
continue;
}
// 当前字的多音字拼音
const ps = _pinyin(text[i], {
toneType: 'none',
const ps = _pinyin(words[i], {
toneType: "none",
multiple: true,
type: 'array',
type: "array",
});
let currentLength = 0;
ps.forEach((p) => {
@@ -128,8 +130,8 @@ const matchAny = (
return null;
}
}
if (options.space === 'ignore') {
result = result.filter((i) => text[i] !== ' ');
if (options.space === "ignore") {
result = result.filter((i) => words[i] !== " ");
}
return result.length ? result : null;
};
@@ -139,7 +141,7 @@ const matchAboveStart = (
pinyin: string,
options: Required<MatchOptions>
) => {
const words = text.split('');
const words = splitString(text);
// 二维数组 dp[i][j]i 表示遍历到的 text 索引+1, j 表示遍历到的 pinyin 的索引+1
const dp = Array(words.length + 1);
@@ -157,7 +159,7 @@ const matchAboveStart = (
// options.continuous 为 false 或 options.space 为 ignore 且当前为空格时,第 i 个字可以不参与匹配
if (
!options.continuous ||
(options.space == 'ignore' && text[i - 1] === ' ')
(options.space == "ignore" && words[i - 1] === " ")
) {
for (let j = 1; j <= pinyin.length; j++) {
dp[i][j - 1] = dp[i - 1][j - 1];
@@ -172,14 +174,14 @@ const matchAboveStart = (
// 非开头且前面的字符未匹配完成,停止向后匹配
continue;
} else {
const muls = _pinyin(text[i - 1], {
type: 'array',
toneType: 'none',
const muls = _pinyin(words[i - 1], {
type: "array",
toneType: "none",
multiple: true,
});
// 非中文匹配
if (text[i - 1] === pinyin[j - 1]) {
if (words[i - 1] === pinyin[j - 1]) {
const matches = [...dp[i - 1][j - 1], i - 1];
// 记录最长的可匹配下标数组
if (!dp[i][j] || matches.length > dp[i][j].length) {
@@ -195,16 +197,16 @@ const matchAboveStart = (
if (pinyin.length - j <= MAX_PINYIN_LENGTH) {
// lastPrecision 参数处理
const last = muls.some((py) => {
if (options.lastPrecision === 'any') {
if (options.lastPrecision === "any") {
return py.includes(pinyin.slice(j - 1, pinyin.length));
}
if (options.lastPrecision === 'start') {
if (options.lastPrecision === "start") {
return py.startsWith(pinyin.slice(j - 1, pinyin.length));
}
if (options.lastPrecision === 'first') {
if (options.lastPrecision === "first") {
return py[0] === pinyin.slice(j - 1, pinyin.length);
}
if (options.lastPrecision === 'every') {
if (options.lastPrecision === "every") {
return py === pinyin.slice(j - 1, pinyin.length);
}
return false;
@@ -217,7 +219,7 @@ const matchAboveStart = (
const precision = options.precision;
// precision 为 start 时,匹配开头
if (precision === 'start') {
if (precision === "start") {
muls.forEach((py) => {
let end = j;
const matches = [...dp[i - 1][j - 1], i - 1];
@@ -234,7 +236,7 @@ const matchAboveStart = (
}
// precision 为 first 时,匹配首字母
if (precision === 'first') {
if (precision === "first") {
if (muls.some((py) => py[0] === pinyin[j - 1])) {
const matches = [...dp[i - 1][j - 1], i - 1];
// 记录最长的可匹配下标数组
@@ -261,3 +263,33 @@ const matchAboveStart = (
}
return null;
};
/**
 * Translate match indices that refer to entries of `splitString(text)` into
 * indices into `text` itself.
 *
 * Every entry of length 2 that precedes (or is) a matched entry shifts the
 * index by one; a matched entry of length 2 contributes both of its positions.
 *
 * @param text the original text that was matched
 * @param indexArray indices into `splitString(text)`, or null when nothing
 *   matched; the forward-only scan assumes ascending order
 * @returns indices into `text`, or null when `indexArray` is null
 */
function processDoubleUnicodeIndex(
  text: string,
  indexArray: number[] | null
): number[] | null {
  if (!indexArray) {
    return null;
  }
  const chars = splitString(text);
  const mapped: number[] = [];
  // Number of entries already inspected; it only ever moves forward.
  let scanned = 0;
  // Count of two-unit entries among the inspected ones.
  let shift = 0;
  for (const wordIndex of indexArray) {
    for (; scanned <= wordIndex; scanned++) {
      if (chars[scanned].length === 2) {
        shift += 1;
      }
    }
    // Position of the last code unit of the matched entry inside `text`.
    const lastUnit = wordIndex + shift;
    if (chars[wordIndex].length === 2) {
      mapped.push(lastUnit - 1, lastUnit);
    } else {
      mapped.push(lastUnit);
    }
  }
  return mapped;
}

View File

@@ -1,4 +1,4 @@
import { getStringLength, isZhChar } from "@/common/utils";
import { stringLength, isZhChar } from "@/common/utils";
import type { SingleWordResult } from "../../common/type";
import {
DoubleUnicodePrefixReg,
@@ -59,7 +59,7 @@ export const middlewareMultiple = (
word: string,
options: CompleteOptions
): SingleWordResult[] | false => {
if (getStringLength(word) === 1 && options.multiple) {
if (stringLength(word) === 1 && options.multiple) {
return getMultiplePinyin(word, options.surname);
} else {
return false;
@@ -166,7 +166,7 @@ export const middlewareType = (
options: CompleteOptions,
word: string
) => {
if (options.multiple && getStringLength(word) === 1) {
if (options.multiple && stringLength(word) === 1) {
let last = "";
list = list.filter((item) => {
const res = item.result !== last;

View File

@@ -15,7 +15,7 @@ import {
getFinalParts,
} from '@/core/pinyin/handle';
import { getCustomPolyphonicDict } from '../custom';
import { isZhChar } from '@/common/utils';
import { isZhChar, splitString } from '@/common/utils';
interface BasicOptions {
/**
@@ -206,7 +206,7 @@ function polyphonic(
// 获取每个字多音字的数组
const getPolyphonicList = (text: string): SingleWordResult[] => {
return text.split('').map((word) => {
return splitString(text).map((word) => {
const wordCode = word.charCodeAt(0);
const customPolyphonicDict = getCustomPolyphonicDict();
const pinyin = customPolyphonicDict[wordCode] || DICT1[wordCode] || word;

View File

@@ -1,4 +1,4 @@
import { match } from '../lib/index';
import { match, customPinyin, clearCustomDict } from '../lib/index';
import { expect, describe, it } from 'vitest';
describe('match', () => {
@@ -90,6 +90,17 @@ describe('match', () => {
expect(result).to.deep.equal([2, 4]);
});
// Registers a custom reading for the double-unicode character 𧒽 and checks
// that match() reports both of its string positions (0 and 1) before the
// positions of the following characters.
it('[match]first&space', () => {
  try {
    customPinyin(
      {
        𧒽: 'lei',
      },
      {
        multiple: 'replace',
      }
    );
    const result = match('𧒽测 试', 'l c s');
    expect(result).to.deep.equal([0, 1, 2, 4]);
  } finally {
    // Always reset the custom dictionaries: without this, a failing assertion
    // would leave the custom entry registered for the tests that follow.
    clearCustomDict(['pinyin', 'multiple', 'polyphonic']);
  }
});
it('[match]nonZh match', () => {
const result = match('测uuuuuuuuuu试', 'cuuuuuu');
expect(result).to.deep.equal([0, 1, 2, 3, 4, 5, 6]);

View File

@@ -1,2 +1,3 @@
export declare function getStringLength(string: string): number;
export declare function stringLength(text: string): number;
export declare function splitString(text: string): string[];
export declare function isZhChar(char: string): boolean;

View File

@@ -2,7 +2,7 @@ interface MatchOptions {
/**
* @description 每个汉字和拼音需要遵从的匹配精度
*/
precision?: 'first' | 'start' | 'every' | 'any';
precision?: "first" | "start" | "every" | "any";
/**
* @description 匹配的汉字下标是否为连续的才算匹配成功
*/
@@ -10,11 +10,11 @@ interface MatchOptions {
/**
* @description 匹配时对于空格的处理
*/
space?: 'ignore' | 'preserve';
space?: "ignore" | "preserve";
/**
* @description 最后一个字的匹配精度
*/
lastPrecision?: 'first' | 'start' | 'every' | 'any';
lastPrecision?: "first" | "start" | "every" | "any";
/**
* @description 是否大小写不敏感
*/
@@ -27,5 +27,5 @@ interface MatchOptions {
* @param {MatchOptions=} options 配置项
* @return {Array | null} 若匹配成功,返回 text 中匹配成功的下标数组;若匹配失败,返回 null
*/
export declare const match: (text: string, pinyin: string, options?: MatchOptions) => any;
export declare const match: (text: string, pinyin: string, options?: MatchOptions) => number[] | null;
export {};