Merge pull request #242 from zh-lx/feature-2unicode-match-and-custom

feat: custom 和 match api 对于双 unicode 编码字符的适配
2026-03-13 09:51:38 +08:00 · 2024-06-05 11:19:09 +08:00
parent 323175ebb5 7ea51e3ec0
commit 38306a5a0e
9 changed files with 115 additions and 54 deletions
--- a/lib/common/utils.ts
+++ b/lib/common/utils.ts
@@ -1,7 +1,24 @@
-import { DoubleUnicodeReg } from './constant';
+import { DoubleUnicodePrefixReg, DoubleUnicodeSuffixReg, DoubleUnicodeReg } from './constant';

-export function getStringLength(string: string) {
-  return string.replace(DoubleUnicodeReg, '_').length;
+export function stringLength(text: string) {
+  return text.replace(DoubleUnicodeReg, '_').length; // length after substituting '_' for DoubleUnicodeReg matches; presumably makes a two-code-unit character count as one (confirm against the regex in ./constant)
+}
+
+// Split text into an array of characters, keeping a two-code-unit (prefix + suffix) character together as one element
+export function splitString(text: string): string[] {
+  const result = [];
+  let i = 0; // index into text, in UTF-16 code units
+  while (i < text.length) {
+    const char = text.charAt(i);
+    if (DoubleUnicodePrefixReg.test(char) && DoubleUnicodeSuffixReg.test(text.charAt(i + 1))) { // current unit matches the prefix regex and the next unit matches the suffix regex
+      result.push(text.substring(i, i + 2)); // emit both code units as a single element
+      i += 2;
+    } else {
+      result.push(char); // emit the single code unit on its own (this also covers a prefix with no matching suffix)
+      i += 1;
+    }
+  }
+  return result;
 }

 export function isZhChar(char: string) {
--- a/lib/core/custom/index.ts
+++ b/lib/core/custom/index.ts
@@ -1,6 +1,6 @@
 import { acTree } from '@/common/segmentit';
 import { Probability, Priority } from '@/common/constant';
-import { getStringLength } from '@/common/utils';
+import { splitString, stringLength } from '@/common/utils';
 import DICT1 from '@/data/dict1';
 let customDict: { [key: string]: string } = {};
 let customMultipleDict: string[] = [];
@@ -33,7 +33,7 @@ export function customPinyin(
  options?: CustomPinyinOptions
 ) {
  const keys = Object.keys(config).sort(
-    (key1, key2) => getStringLength(key2) - getStringLength(key1)
+    (key1, key2) => stringLength(key2) - stringLength(key1)
  );
  keys.forEach((key) => {
    customDict[key] = config[key];
@@ -41,7 +41,7 @@ export function customPinyin(
  const customPatterns = Object.keys(customDict).map((key) => ({
    zh: key,
    pinyin: customDict[key],
-    probability: Probability.Custom + getStringLength(key),
+    probability: Probability.Custom + stringLength(key),
    length: key.length,
    priority: Priority.Custom,
    dict: CustomDictName,
@@ -63,10 +63,10 @@ function addCustomConfigToDict(
 ) {
  for (let key in config) {
    const pinyins = config[key];
-    key.split('').forEach((word, index) => {
+    splitString(key).forEach((word, index) => {
      const pinyin = pinyins.split(' ')[index] || '';
      const wordCode = word.charCodeAt(0);
-      if (handleType === 'replace') {
+      if (handleType === 'replace' || (handleType === 'add' && !dict[wordCode] && !DICT1[wordCode])) {
        // 直接覆盖原词典
        dict[wordCode] = pinyin;
      } else {
--- a/lib/core/dict/index.ts
+++ b/lib/core/dict/index.ts
@@ -1,6 +1,6 @@
 import { Priority, Probability } from "@/common/constant";
 import { Pattern, acTree } from "@/common/segmentit";
-import { getStringLength } from "@/common/utils";
+import { stringLength } from "@/common/utils";
 import DICT1 from "@/data/dict1";

 const DefaultName = Symbol("default");
@@ -29,7 +29,7 @@ export function addDict(dict: DICT | {}, options?: string | DictOptions) {
  for (let key in dict as DICT) {
    const value = (dict as DICT)[key];
    const pinyin = Array.isArray(value) ? value[0] : value;
-    if (getStringLength(key) === 1) {
+    if (stringLength(key) === 1) {
      addToOriginDict(
        dictName,
        key,
--- a/lib/core/match/index.ts
+++ b/lib/core/match/index.ts
@@ -1,10 +1,11 @@
-import { pinyin as _pinyin } from '@/core/pinyin';
+import { splitString } from "@/common/utils";
+import { pinyin as _pinyin } from "@/core/pinyin";

 interface MatchOptions {
  /**
   * @description 每个汉字和拼音需要遵从的匹配精度
   */
-  precision?: 'first' | 'start' | 'every' | 'any';
+  precision?: "first" | "start" | "every" | "any";
  /**
   * @description 匹配的汉字下标是否为连续的才算匹配成功
   */
@@ -12,11 +13,11 @@ interface MatchOptions {
  /**
   * @description 匹配时对于空格的处理
   */
-  space?: 'ignore' | 'preserve';
+  space?: "ignore" | "preserve";
  /**
   * @description 最后一个字的匹配精度
   */
-  lastPrecision?: 'first' | 'start' | 'every' | 'any';
+  lastPrecision?: "first" | "start" | "every" | "any";
  /**
   * @description 是否大小写不敏感
   */
@@ -24,10 +25,10 @@ interface MatchOptions {
 }

 const DefaultMatchOptions: MatchOptions = {
-  precision: 'first',
+  precision: "first",
  continuous: false,
-  space: 'ignore',
-  lastPrecision: 'start',
+  space: "ignore",
+  lastPrecision: "start",
  insensitive: true,
 };

@@ -41,8 +42,8 @@ const MAX_PINYIN_LENGTH = 6;
 * @return {Array | null} 若匹配成功，返回 text 中匹配成功的下标数组；若匹配失败，返回 null
 */
 export const match = (text: string, pinyin: string, options?: MatchOptions) => {
-  if (options?.precision === 'any') {
-    options.lastPrecision = 'any';
+  if (options?.precision === "any") {
+    options.lastPrecision = "any";
  }
  const completeOptions = {
    ...DefaultMatchOptions,
@@ -54,14 +55,14 @@ export const match = (text: string, pinyin: string, options?: MatchOptions) => {
    pinyin = pinyin.toLowerCase();
  }
  // 移除空格
-  if (completeOptions.space === 'ignore') {
-    pinyin = pinyin.replace(/\s/g, '');
+  if (completeOptions.space === "ignore") {
+    pinyin = pinyin.replace(/\s/g, "");
  }
  const result =
-    options?.precision === 'any'
+    options?.precision === "any"
      ? matchAny(text, pinyin, completeOptions)
      : matchAboveStart(text, pinyin, completeOptions);
-  return result;
+  return processDoubleUnicodeIndex(text, result);
 };

 // 检测两个拼音最大的匹配长度
@@ -81,23 +82,24 @@ const matchAny = (
  options: Required<MatchOptions>
 ) => {
  let result = [];
-  for (let i = 0; i < text.length; i++) {
+  const words = splitString(text);
+  for (let i = 0; i < words.length; i++) {
    // 空格字符
-    if (options.space === 'ignore' && text[i] === ' ') {
+    if (options.space === "ignore" && words[i] === " ") {
      result.push(i);
      continue;
    }
    // 是否为中文匹配
-    if (text[i] === pinyin[0]) {
+    if (words[i] === pinyin[0]) {
      pinyin = pinyin.slice(1);
      result.push(i);
      continue;
    }
    // 当前字的多音字拼音
-    const ps = _pinyin(text[i], {
-      toneType: 'none',
+    const ps = _pinyin(words[i], {
+      toneType: "none",
      multiple: true,
-      type: 'array',
+      type: "array",
    });
    let currentLength = 0;
    ps.forEach((p) => {
@@ -128,8 +130,8 @@ const matchAny = (
      return null;
    }
  }
-  if (options.space === 'ignore') {
-    result = result.filter((i) => text[i] !== ' ');
+  if (options.space === "ignore") {
+    result = result.filter((i) => words[i] !== " ");
  }
  return result.length ? result : null;
 };
@@ -139,7 +141,7 @@ const matchAboveStart = (
  pinyin: string,
  options: Required<MatchOptions>
 ) => {
-  const words = text.split('');
+  const words = splitString(text);

  // 二维数组 dp[i][j]，i 表示遍历到的 text 索引+1, j 表示遍历到的 pinyin 的索引+1
  const dp = Array(words.length + 1);
@@ -157,7 +159,7 @@ const matchAboveStart = (
    // options.continuous 为 false 或 options.space 为 ignore 且当前为空格时，第 i 个字可以不参与匹配
    if (
      !options.continuous ||
-      (options.space == 'ignore' && text[i - 1] === ' ')
+      (options.space == "ignore" && words[i - 1] === " ")
    ) {
      for (let j = 1; j <= pinyin.length; j++) {
        dp[i][j - 1] = dp[i - 1][j - 1];
@@ -172,14 +174,14 @@ const matchAboveStart = (
        // 非开头且前面的字符未匹配完成，停止向后匹配
        continue;
      } else {
-        const muls = _pinyin(text[i - 1], {
-          type: 'array',
-          toneType: 'none',
+        const muls = _pinyin(words[i - 1], {
+          type: "array",
+          toneType: "none",
          multiple: true,
        });

        // 非中文匹配
-        if (text[i - 1] === pinyin[j - 1]) {
+        if (words[i - 1] === pinyin[j - 1]) {
          const matches = [...dp[i - 1][j - 1], i - 1];
          // 记录最长的可匹配下标数组
          if (!dp[i][j] || matches.length > dp[i][j].length) {
@@ -195,16 +197,16 @@ const matchAboveStart = (
        if (pinyin.length - j <= MAX_PINYIN_LENGTH) {
          // lastPrecision 参数处理
          const last = muls.some((py) => {
-            if (options.lastPrecision === 'any') {
+            if (options.lastPrecision === "any") {
              return py.includes(pinyin.slice(j - 1, pinyin.length));
            }
-            if (options.lastPrecision === 'start') {
+            if (options.lastPrecision === "start") {
              return py.startsWith(pinyin.slice(j - 1, pinyin.length));
            }
-            if (options.lastPrecision === 'first') {
+            if (options.lastPrecision === "first") {
              return py[0] === pinyin.slice(j - 1, pinyin.length);
            }
-            if (options.lastPrecision === 'every') {
+            if (options.lastPrecision === "every") {
              return py === pinyin.slice(j - 1, pinyin.length);
            }
            return false;
@@ -217,7 +219,7 @@ const matchAboveStart = (
        const precision = options.precision;

        // precision 为 start 时，匹配开头
-        if (precision === 'start') {
+        if (precision === "start") {
          muls.forEach((py) => {
            let end = j;
            const matches = [...dp[i - 1][j - 1], i - 1];
@@ -234,7 +236,7 @@ const matchAboveStart = (
        }

        // precision 为 first 时，匹配首字母
-        if (precision === 'first') {
+        if (precision === "first") {
          if (muls.some((py) => py[0] === pinyin[j - 1])) {
            const matches = [...dp[i - 1][j - 1], i - 1];
            // 记录最长的可匹配下标数组
@@ -261,3 +263,33 @@ const matchAboveStart = (
  }
  return null;
 };
+
+// Map indices over splitString(text) elements back to indices into the raw string; a two-code-unit character yields both of its positions
+function processDoubleUnicodeIndex(
+  text: string,
+  indexArray: number[] | null
+): number[] | null {
+  if (!indexArray) {
+    return null; // no index array to translate: pass null through unchanged
+  }
+  const result = [];
+  let doubleUnicodeCount = 0; // two-code-unit elements seen at word positions below i
+  const words = splitString(text);
+  let i = 0; // next word position to scan; it never rewinds, so indexArray is assumed to be ascending
+  for (let j = 0; j < indexArray.length; j++) {
+    const curIndex = indexArray[j];
+    while (i <= curIndex) {
+      if (words[i].length === 2) {
+        doubleUnicodeCount++;
+      }
+      i++;
+    }
+    const realIndex = curIndex + doubleUnicodeCount; // raw-string position of this element's last code unit
+    if (words[curIndex].length === 2) {
+      result.push(realIndex - 1, realIndex); // report both code units of the two-unit character
+    } else {
+      result.push(realIndex);
+    }
+  }
+  return result;
+}
--- a/lib/core/pinyin/middlewares.ts
+++ b/lib/core/pinyin/middlewares.ts
@@ -1,4 +1,4 @@
-import { getStringLength, isZhChar } from "@/common/utils";
+import { stringLength, isZhChar } from "@/common/utils";
 import type { SingleWordResult } from "../../common/type";
 import {
  DoubleUnicodePrefixReg,
@@ -59,7 +59,7 @@ export const middlewareMultiple = (
  word: string,
  options: CompleteOptions
 ): SingleWordResult[] | false => {
-  if (getStringLength(word) === 1 && options.multiple) {
+  if (stringLength(word) === 1 && options.multiple) {
    return getMultiplePinyin(word, options.surname);
  } else {
    return false;
@@ -166,7 +166,7 @@ export const middlewareType = (
  options: CompleteOptions,
  word: string
 ) => {
-  if (options.multiple && getStringLength(word) === 1) {
+  if (options.multiple && stringLength(word) === 1) {
    let last = "";
    list = list.filter((item) => {
      const res = item.result !== last;
--- a/lib/core/polyphonic/index.ts
+++ b/lib/core/polyphonic/index.ts
@@ -15,7 +15,7 @@ import {
  getFinalParts,
 } from '@/core/pinyin/handle';
 import { getCustomPolyphonicDict } from '../custom';
-import { isZhChar } from '@/common/utils';
+import { isZhChar, splitString } from '@/common/utils';

 interface BasicOptions {
  /**
@@ -206,7 +206,7 @@ function polyphonic(

 // 获取每个字多音字的数组
 const getPolyphonicList = (text: string): SingleWordResult[] => {
-  return text.split('').map((word) => {
+  return splitString(text).map((word) => {
    const wordCode = word.charCodeAt(0);
    const customPolyphonicDict = getCustomPolyphonicDict();
    const pinyin = customPolyphonicDict[wordCode] || DICT1[wordCode] || word;
--- a/test/match.test.js
+++ b/test/match.test.js
@@ -1,4 +1,4 @@
-import { match } from '../lib/index';
+import { match, customPinyin, clearCustomDict } from '../lib/index';
 import { expect, describe, it } from 'vitest';

 describe('match', () => {
@@ -90,6 +90,17 @@ describe('match', () => {
    expect(result).to.deep.equal([2, 4]);
  });

+  it('[match]first&space', () => {
+    customPinyin({
+      𧒽: 'lei'
+    }, {
+      multiple: 'replace'
+    })
+    const result = match('𧒽测 试', 'l c s'); // the text starts with a character that takes two UTF-16 code units
+    expect(result).to.deep.equal([0, 1, 2, 4]); // 𧒽 contributes string indices 0 and 1; the space at index 3 is not reported
+    clearCustomDict(['pinyin', 'multiple', 'polyphonic']); // presumably resets the custom dictionaries so this entry does not leak into other tests
+  });
+
  it('[match]nonZh match', () => {
    const result = match('测uuuuuuuuuu试', 'cuuuuuu');
    expect(result).to.deep.equal([0, 1, 2, 3, 4, 5, 6]);
--- a/types/common/utils.d.ts
+++ b/types/common/utils.d.ts
@@ -1,2 +1,3 @@
-export declare function getStringLength(string: string): number;
+export declare function stringLength(text: string): number;
+export declare function splitString(text: string): string[];
 export declare function isZhChar(char: string): boolean;
--- a/types/core/match/index.d.ts
+++ b/types/core/match/index.d.ts
@@ -2,7 +2,7 @@ interface MatchOptions {
    /**
     * @description 每个汉字和拼音需要遵从的匹配精度
     */
-    precision?: 'first' | 'start' | 'every' | 'any';
+    precision?: "first" | "start" | "every" | "any";
    /**
     * @description 匹配的汉字下标是否为连续的才算匹配成功
     */
@@ -10,11 +10,11 @@ interface MatchOptions {
    /**
     * @description 匹配时对于空格的处理
     */
-    space?: 'ignore' | 'preserve';
+    space?: "ignore" | "preserve";
    /**
     * @description 最后一个字的匹配精度
     */
-    lastPrecision?: 'first' | 'start' | 'every' | 'any';
+    lastPrecision?: "first" | "start" | "every" | "any";
    /**
     * @description 是否大小写不敏感
     */
@@ -27,5 +27,5 @@ interface MatchOptions {
 * @param {MatchOptions=} options 配置项
 * @return {Array | null} 若匹配成功，返回 text 中匹配成功的下标数组；若匹配失败，返回 null
 */
-export declare const match: (text: string, pinyin: string, options?: MatchOptions) => any;
+export declare const match: (text: string, pinyin: string, options?: MatchOptions) => number[] | null;
 export {};