feat: add traditional Chinese character recognition mode

Add support for traditional Chinese characters with the `traditional` option.
This includes new `addTraditionalDict` and `getTraditionalDict` APIs,
and integration with pinyin, html, and segment functions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
zhoulixiang
2026-01-18 12:38:30 +08:00
parent d7b38bb846
commit 7f658c978b
16 changed files with 176 additions and 73 deletions

View File

@@ -5,6 +5,7 @@ export interface SingleWordResult {
result: string;
isZh: boolean;
delete?: boolean;
traditional?: string;
}
// toneType 属性可选参数
@@ -72,4 +73,10 @@ export type CommonOptions = {
* @value standard不将 `y`、`w` 视为声母
*/
initialPattern?: InitialPattern;
/**
* @description 是否启用繁体字模式(可以更好地识别繁体字)
* @value false不启用繁体字模式 (默认值)
* @value true启用繁体字模式
*/
traditional?: boolean;
};

View File

@@ -1,6 +1,12 @@
import { pinyin } from "@/core/pinyin";
import type { BasicOptions } from "../../core/pinyin";
interface HtmlOptions {
type HtmlBaseOptions = Pick<
BasicOptions,
"toneType" | "v" | "toneSandhi" | "segmentit" | "traditional" | "surname" | "mode"
>;
interface HtmlOptions extends HtmlBaseOptions {
/**
* @description html 结果中每个字+拼音外层 span 标签的类名。默认为 py-result-item
*/
@@ -21,35 +27,18 @@ interface HtmlOptions {
* @description html 非汉字字符外层 span 标签的类名,仅当 wrapNonChinese 为 true 时生效。默认为 py-non-chinese-item
*/
nonChineseClass?: string;
/**
* @description 拼音上是否标注音调
*/
toneType?: "symbol" | "num" | "none";
/**
* @description 对于指定的汉字及字符,在 result 上额外补充的拼音
*/
customClassMap?: {
[classname: string]: string[];
};
/**
* @description 是否开启「一」和 「不」字的变调。默认开启。参考https://zh.wiktionary.org/wiki/Appendix:%E2%80%9C%E4%B8%80%E2%80%9D%E5%8F%8A%E2%80%9C%E4%B8%8D%E2%80%9D%E7%9A%84%E5%8F%98%E8%B0%83
* @value true开启
* @value false不开启
*/
toneSandhi?: boolean;
/**
* @description 是否保留 <rp>(</rp> 标签,默认为保留
* @value true保留 <rp>(</rp>
* @value false移除 <rp>(</rp>
*/
rp?: boolean;
/**
* @description 对于 ü 的返回是否转换成 v仅在 toneType: none 启用时生效)
* @value false返回值中保留 ü (默认值)
* @value true返回值中 ü 转换成 v
* @value string返回值中 ü 转换成指定字符
*/
v?: boolean | string;
}
const DefaultHtmlOptions: HtmlOptions = {
@@ -63,6 +52,7 @@ const DefaultHtmlOptions: HtmlOptions = {
toneSandhi: true,
rp: true,
v: false,
traditional: false,
};
/**
@@ -78,9 +68,7 @@ export const html = (text: string, options?: HtmlOptions) => {
} as Required<HtmlOptions>;
const pinyinArray = pinyin(text, {
type: "all",
toneType: completeOptions.toneType,
toneSandhi: options?.toneSandhi,
v: completeOptions.v,
...completeOptions,
});
const result = pinyinArray.map((item) => {
let additionalClass = "";

View File

@@ -18,6 +18,7 @@ import {
} from "../../common/segmentit";
import { Priority } from "@/common/constant";
import { splitString } from "@/common/utils";
import { getTraditionalDict } from "../traditional";
/**
* @description: 获取单个字符的拼音
@@ -31,13 +32,30 @@ export const getSingleWordPinyin: GetSingleWordPinyin = (char) => {
return pinyin ? pinyin.split(" ")[0] : char;
};
/**
 * Builds the string that is handed to the dictionary search when the
 * `traditional` option is on: every UTF-16 code unit of `word` that has a
 * truthy entry in the registered traditional dictionary is replaced by that
 * entry; everything else is copied through unchanged.
 *
 * Surrogate code units (0xD800–0xDFFF) are never looked up. The dictionary is
 * indexed by a single code unit, so a surrogate half can only ever hit an
 * entry that was registered for a *different* astral-plane character sharing
 * the same high surrogate; substituting it would tear the pair apart and
 * corrupt emoji / extension-B characters in the output.
 *
 * NOTE(review): getPinyin applies the match indices computed on the returned
 * string to the characters of the original word, which presumably requires
 * every dictionary value to be exactly one character long — confirm against
 * the shipped dictionary data.
 *
 * @param word text to convert
 * @returns `word` with all dictionary-known characters substituted
 */
const getTraditionalWords = (word: string): string => {
  const traditionalDict = getTraditionalDict();
  let converted = "";
  for (let i = 0; i < word.length; i++) {
    const code = word.charCodeAt(i);
    // Halves of a surrogate pair are not characters on their own: copy as-is.
    const isSurrogate = code >= 0xd800 && code <= 0xdfff;
    const mapped = isSurrogate ? undefined : traditionalDict[code];
    // Truthiness check keeps the original contract: missing or empty entries
    // leave the source character untouched.
    converted += mapped ? mapped : word.charAt(i);
  }
  return converted;
};
export const getPinyin = (
word: string,
list: SingleWordResult[],
surname: SurnameMode,
segmentit: TokenizationAlgorithm
segmentit: TokenizationAlgorithm,
traditional?: boolean,
): { list: SingleWordResult[]; matches: MatchPattern[] } => {
const matches = acTree.search(word, surname, segmentit);
const searchWord = traditional ? getTraditionalWords(word) : word;
const matches = acTree.search(searchWord, surname, segmentit);
let matchIndex = 0;
const zhChars = splitString(word);
for (let i = 0; i < zhChars.length; ) {
@@ -45,6 +63,7 @@ export const getPinyin = (
if (match && i === match.index) {
if (match.length === 1 && match.priority <= Priority.Normal) {
const char = zhChars[i];
match.zh = char;
let pinyin: string = "";
pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]);
list[i] = {
@@ -59,10 +78,12 @@ export const getPinyin = (
}
const pinyins = match.pinyin.split(" ");
let pinyinIndex = 0;
if (traditional) {
match.zh = zhChars.slice(match.index, match.index + match.length).join("");
}
for (let j = 0; j < match.length; j++) {
const zhChars = splitString(match.zh);
list[i + j] = {
origin: zhChars[j],
origin: zhChars[j + match.index],
result: pinyins[pinyinIndex] || "",
isZh: true,
originPinyin: pinyins[pinyinIndex] || "",
@@ -121,7 +142,7 @@ export const getAllPinyin: GetAllPinyin = (char, surname = "off") => {
const surnamePinyin = Surnames[char];
if (surnamePinyin) {
pinyin = [surnamePinyin].concat(
pinyin.filter((py) => py !== surnamePinyin)
pinyin.filter((py) => py !== surnamePinyin),
);
}
}
@@ -135,7 +156,7 @@ export const getAllPinyin: GetAllPinyin = (char, surname = "off") => {
*/
type GetMultiplePinyin = (
word: string,
surname?: SurnameMode
surname?: SurnameMode,
) => SingleWordResult[];
const getMultiplePinyin: GetMultiplePinyin = (word, surname = "off") => {
let pinyin = getAllPinyin(word, surname);
@@ -166,7 +187,7 @@ const getMultiplePinyin: GetMultiplePinyin = (word, surname = "off") => {
*/
type GetInitialAndFinal = (
pinyin: string,
initialPattern?: InitialPattern
initialPattern?: InitialPattern,
) => {
final: string;
initial: string;

View File

@@ -163,7 +163,7 @@ function pinyin(word: string, options?: OptionsReturnAll): AllData[];
*/
function pinyin(
word: string,
options?: CompleteOptions
options?: CompleteOptions,
): string | string[] | AllData[] {
options = { ...DEFAULT_OPTIONS, ...(options || {}) };
// 校验 word 类型是否正确
@@ -203,7 +203,8 @@ function pinyin(
word,
_list,
options.surname as SurnameMode,
options.segmentit as TokenizationAlgorithm
options.segmentit as TokenizationAlgorithm,
options.traditional,
);
// 一和不变调处理

View File

@@ -1,4 +1,4 @@
import { BasicOptions } from "../pinyin";
import type { BasicOptions } from "../pinyin";
import { TokenizationAlgorithm } from "../../common/segmentit";
import { stringLength } from "@/common/utils";
import { middleWareNonZh, middlewareToneSandhi, middlewareToneType, middlewareV, validateType } from "@/core/pinyin/middlewares";
@@ -8,7 +8,7 @@ import { middlewareOutputFormat, middlewareSegment, Output, OutputFormat } from
type SegmentBaseOptions = Pick<
BasicOptions,
"toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit"
"toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit" | "traditional"
>;
interface AllSegmentReturnOptions extends SegmentBaseOptions {
@@ -103,6 +103,7 @@ const DEFAULT_OPTIONS: SegmentCompleteOptions = {
toneSandhi: true,
segmentit: TokenizationAlgorithm.MaxProbability,
format: OutputFormat.AllSegment,
traditional: false,
};
export function segment(word: string, options?: AllSegmentReturnOptions): Output['AllSegment'];
@@ -138,7 +139,8 @@ export function segment(word: string, options?: SegmentCompleteOptions) {
word,
_list,
options.surname as SurnameMode,
options.segmentit as TokenizationAlgorithm
options.segmentit as TokenizationAlgorithm,
options.traditional as boolean
);
// 一和不变调处理

View File

@@ -0,0 +1,13 @@
/**
 * Module-level lookup table filled by `addTraditionalDict`.
 * It is a sparse array indexed by the UTF-16 code unit of the source character;
 * the element is the replacement string registered for that character.
 */
const traditionalDict: string[] = [];
/**
 * Registers character mappings in the shared traditional dictionary.
 * Later registrations for the same character overwrite earlier ones.
 *
 * Only the first UTF-16 code unit of each key is used as the index. Entries
 * that cannot be indexed safely are skipped instead of polluting the table:
 * - inherited (non-own) properties of `dict`;
 * - the empty key, whose `charCodeAt(0)` is `NaN`;
 * - keys starting with a surrogate code unit (astral-plane characters), which
 *   would otherwise all collide on their shared high surrogate;
 * - non-string values (possible when called from untyped JavaScript).
 *
 * @param dict map from a single source character to its replacement
 */
export function addTraditionalDict(dict: Record<string, string>): void {
  for (const key of Object.keys(dict)) {
    if (key === "") {
      continue;
    }
    const value = dict[key];
    if (typeof value !== "string") {
      continue;
    }
    const code = key.charCodeAt(0);
    if (code >= 0xd800 && code <= 0xdfff) {
      continue;
    }
    traditionalDict[code] = value;
  }
}
/**
 * Returns the shared traditional dictionary table.
 * The live array is returned (not a copy), so it reflects later
 * `addTraditionalDict` calls.
 *
 * @returns sparse array indexed by UTF-16 code unit
 */
export function getTraditionalDict(): string[] {
  return traditionalDict;
}

View File

@@ -1,9 +1,14 @@
export { getInitialAndFinal, getFinalParts, getNumOfTone } from './core/pinyin/handle';
export { pinyin } from './core/pinyin';
export { customPinyin, clearCustomDict } from './core/custom';
export { addDict, removeDict } from './core/dict';
export { match } from './core/match';
export { html } from './core/html';
export { polyphonic } from './core/polyphonic';
export { convert } from './core/convert';
export { segment, OutputFormat } from './core/segment';
export {
getInitialAndFinal,
getFinalParts,
getNumOfTone,
} from "./core/pinyin/handle";
export { pinyin } from "./core/pinyin";
export { customPinyin, clearCustomDict } from "./core/custom";
export { addDict, removeDict } from "./core/dict";
export { match } from "./core/match";
export { html } from "./core/html";
export { polyphonic } from "./core/polyphonic";
export { convert } from "./core/convert";
export { segment, OutputFormat } from "./core/segment";
export { addTraditionalDict, getTraditionalDict } from "./core/traditional";

View File

@@ -54,7 +54,7 @@
"devDependencies": {
"@commitlint/cli": "^11.0.0",
"@commitlint/config-conventional": "^11.0.0",
"@pinyin-pro/data": "1.0.3",
"@pinyin-pro/data": "1.3.0",
"@rollup/plugin-commonjs": "^17.1.0",
"@rollup/plugin-json": "^4.1.0",
"@rollup/plugin-node-resolve": "^11.2.0",

69
test/traditional.test.js Normal file
View File

@@ -0,0 +1,69 @@
import { pinyin, html, addTraditionalDict, segment } from '../lib/index';
import traditionalDict from '@pinyin-pro/data/traditional'
import { expect, describe, it } from 'vitest';
// Baseline suite: pinyin, segment and html are called WITHOUT the
// `traditional` option. For the input 轉盤 each of them is expected to produce
// the readings "zhuǎn" + "pán".
describe("without traditional", () => {
// pinyin(): plain string output, readings separated by a space.
it("[pinyin traditional]轉盤", () => {
const result = pinyin("轉盤");
expect(result).to.be.equal("zhuǎn pán");
});
// segment(): 轉 and 盤 are expected as two separate segments here.
it("[segment traditional]一个轉盤", () => {
const result = segment("一个轉盤");
expect(result).to.deep.equal([
{
"origin": "一",
"result": "yí",
},
{
"origin": "个",
"result": "gè",
},
{
"origin": "轉",
"result": "zhuǎn",
},
{
"origin": "盤",
"result": "pán",
},
]);
});
// html(): one ruby-annotated span per character, using the default class names.
it("[html traditional]轉盤", () => {
const result = html("轉盤");
expect(result).to.be.equal('<span class="py-result-item"><ruby><span class="py-chinese-item">轉</span><rp>(</rp><rt class="py-pinyin-item">zhuǎn</rt><rp>)</rp></ruby></span><span class="py-result-item"><ruby><span class="py-chinese-item">盤</span><rp>(</rp><rt class="py-pinyin-item">pán</rt><rp>)</rp></ruby></span>');
});
});
// Suite for `traditional: true`: after registering the dictionary imported
// from '@pinyin-pro/data/traditional', 轉 in 轉盤 is expected to read "zhuàn"
// instead of the "zhuǎn" asserted by the suite without the option.
describe("with traditional", () => {
// NOTE(review): the dictionary is registered directly in the describe body,
// not inside a hook; presumably this runs while the suite is being collected,
// i.e. before any test in this file executes — consider beforeAll if the
// ordering matters.
addTraditionalDict(traditionalDict);
// The input also contains an emoji, which is expected to pass through unchanged.
it("[pinyin with traditional]一个🌛轉盤", () => {
const result = pinyin("一个🌛轉盤", { traditional: true });
expect(result).to.be.equal("yí gè 🌛 zhuàn pán");
});
// segment(): 轉盤 is expected as ONE segment whose origin keeps the
// traditional characters from the input.
it("[segment traditional]一个轉盤", () => {
const result = segment("一个轉盤", { traditional: true });
expect(result).to.deep.equal([
{
"origin": "一",
"result": "yí",
},
{
"origin": "个",
"result": "gè",
},
{
"origin": "轉盤",
"result": "zhuànpán",
},
]);
});
// html(): same markup shape as without the option; only the reading of 轉 differs.
it("[html with traditional]轉盤", () => {
const result = html("轉盤", { traditional: true });
expect(result).to.be.equal('<span class="py-result-item"><ruby><span class="py-chinese-item">轉</span><rp>(</rp><rt class="py-pinyin-item">zhuàn</rt><rp>)</rp></ruby></span><span class="py-result-item"><ruby><span class="py-chinese-item">盤</span><rp>(</rp><rt class="py-pinyin-item">pán</rt><rp>)</rp></ruby></span>');
});
});

View File

@@ -4,6 +4,7 @@ export interface SingleWordResult {
result: string;
isZh: boolean;
delete?: boolean;
traditional?: string;
}
export type ToneType = "symbol" | "num" | "none";
export type PinyinMode = "normal" | "surname";
@@ -59,4 +60,10 @@ export type CommonOptions = {
* @value standard不将 `y`、`w` 视为声母
*/
initialPattern?: InitialPattern;
/**
* @description 是否启用繁体字模式(可以更好地识别繁体字)
* @value false不启用繁体字模式 (默认值)
* @value true启用繁体字模式
*/
traditional?: boolean;
};

View File

@@ -1,4 +1,6 @@
interface HtmlOptions {
import type { BasicOptions } from "../../core/pinyin";
type HtmlBaseOptions = Pick<BasicOptions, "toneType" | "v" | "toneSandhi" | "segmentit" | "traditional" | "surname" | "mode">;
interface HtmlOptions extends HtmlBaseOptions {
/**
* @description html 结果中每个字+拼音外层 span 标签的类名。默认为 py-result-item
*/
@@ -19,35 +21,18 @@ interface HtmlOptions {
* @description html 非汉字字符外层 span 标签的类名,仅当 wrapNonChinese 为 true 时生效。默认为 py-non-chinese-item
*/
nonChineseClass?: string;
/**
* @description 拼音上是否标注音调
*/
toneType?: "symbol" | "num" | "none";
/**
* @description 对于指定的汉字及字符,在 result 上额外补充的拼音
*/
customClassMap?: {
[classname: string]: string[];
};
/**
* @description 是否开启「一」和 「不」字的变调。默认开启。参考https://zh.wiktionary.org/wiki/Appendix:%E2%80%9C%E4%B8%80%E2%80%9D%E5%8F%8A%E2%80%9C%E4%B8%8D%E2%80%9D%E7%9A%84%E5%8F%98%E8%B0%83
* @value true开启
* @value false不开启
*/
toneSandhi?: boolean;
/**
* @description 是否保留 <rp>(</rp> 标签,默认为保留
* @value true保留 <rp>(</rp>
* @value false移除 <rp>(</rp>
*/
rp?: boolean;
/**
* @description 对于 ü 的返回是否转换成 v仅在 toneType: none 启用时生效)
* @value false返回值中保留 ü (默认值)
* @value true返回值中 ü 转换成 v
* @value string返回值中 ü 转换成指定字符
*/
v?: boolean | string;
}
/**
* @description: 获取带拼音汉字的 html 字符串

View File

@@ -8,7 +8,7 @@ import { MatchPattern, TokenizationAlgorithm } from "../../common/segmentit";
*/
type GetSingleWordPinyin = (char: string) => string;
export declare const getSingleWordPinyin: GetSingleWordPinyin;
export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm) => {
export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm, traditional?: boolean) => {
list: SingleWordResult[];
matches: MatchPattern[];
};

View File

@@ -1,6 +1,6 @@
import { BasicOptions } from "../pinyin";
import type { BasicOptions } from "../pinyin";
import { Output, OutputFormat } from "./middlewares";
type SegmentBaseOptions = Pick<BasicOptions, "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit">;
type SegmentBaseOptions = Pick<BasicOptions, "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit" | "traditional">;
interface AllSegmentReturnOptions extends SegmentBaseOptions {
/**
* @description 以片段格式返回全部信息

2
types/core/traditional/index.d.ts vendored Normal file
View File

@@ -0,0 +1,2 @@
/**
 * Registers character mappings in the shared traditional dictionary.
 * Each value is stored under the UTF-16 code unit of the first character of
 * its key; a later registration for the same character overwrites the earlier one.
 *
 * @param dict map from a source character to its replacement string
 */
export declare function addTraditionalDict(dict: Record<string, string>): void;
/**
 * Returns the shared traditional dictionary: a sparse array indexed by UTF-16
 * code unit. The live array is returned, not a copy.
 */
export declare function getTraditionalDict(): string[];

View File

@@ -0,0 +1,2 @@
declare const TraditionalListMap: string[];
export { TraditionalListMap };

19
types/index.d.ts vendored
View File

@@ -1,9 +1,10 @@
export { getInitialAndFinal, getFinalParts, getNumOfTone } from './core/pinyin/handle';
export { pinyin } from './core/pinyin';
export { customPinyin, clearCustomDict } from './core/custom';
export { addDict, removeDict } from './core/dict';
export { match } from './core/match';
export { html } from './core/html';
export { polyphonic } from './core/polyphonic';
export { convert } from './core/convert';
export { segment, OutputFormat } from './core/segment';
export { getInitialAndFinal, getFinalParts, getNumOfTone, } from "./core/pinyin/handle";
export { pinyin } from "./core/pinyin";
export { customPinyin, clearCustomDict } from "./core/custom";
export { addDict, removeDict } from "./core/dict";
export { match } from "./core/match";
export { html } from "./core/html";
export { polyphonic } from "./core/polyphonic";
export { convert } from "./core/convert";
export { segment, OutputFormat } from "./core/segment";
export { addTraditionalDict, getTraditionalDict } from "./core/traditional";