mirror of
https://github.com/zh-lx/pinyin-pro.git
synced 2026-03-13 09:51:38 +08:00
feat: add traditional Chinese character recognition mode
Add support for traditional Chinese characters with the `traditional` option. This includes new `addTraditionalDict` and `getTraditionalDict` APIs, and integration with pinyin, html, and segment functions. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ export interface SingleWordResult {
|
||||
result: string;
|
||||
isZh: boolean;
|
||||
delete?: boolean;
|
||||
traditional?: string;
|
||||
}
|
||||
|
||||
// toneType 属性可选参数
|
||||
@@ -72,4 +73,10 @@ export type CommonOptions = {
|
||||
* @value standard:不将 `y`、`w` 视为声母
|
||||
*/
|
||||
initialPattern?: InitialPattern;
|
||||
/**
|
||||
* @description 是否启用繁体字模式(可以更好地识别繁体字)
|
||||
* @value false:不启用繁体字模式 (默认值)
|
||||
* @value true:启用繁体字模式
|
||||
*/
|
||||
traditional?: boolean;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
import { pinyin } from "@/core/pinyin";
|
||||
import type { BasicOptions } from "../../core/pinyin";
|
||||
|
||||
interface HtmlOptions {
|
||||
type HtmlBaseOptions = Pick<
|
||||
BasicOptions,
|
||||
"toneType" | "v" | "toneSandhi" | "segmentit" | "traditional" | "surname" | "mode"
|
||||
>;
|
||||
|
||||
interface HtmlOptions extends HtmlBaseOptions {
|
||||
/**
|
||||
* @description html 结果中每个字+拼音外层 span 标签的类名。默认为 py-result-item
|
||||
*/
|
||||
@@ -21,35 +27,18 @@ interface HtmlOptions {
|
||||
* @description html 非汉字字符外层 span 标签的类名,仅当 wrapNonChinese 为 true 时生效。默认为 py-non-chinese-item
|
||||
*/
|
||||
nonChineseClass?: string;
|
||||
/**
|
||||
* @description 拼音上是否标注音调
|
||||
*/
|
||||
toneType?: "symbol" | "num" | "none";
|
||||
/**
|
||||
* @description 对于指定的汉字及字符,在 result 上额外补充的拼音
|
||||
*/
|
||||
customClassMap?: {
|
||||
[classname: string]: string[];
|
||||
};
|
||||
/**
|
||||
* @description 是否开启「一」和 「不」字的变调。默认开启。参考:https://zh.wiktionary.org/wiki/Appendix:%E2%80%9C%E4%B8%80%E2%80%9D%E5%8F%8A%E2%80%9C%E4%B8%8D%E2%80%9D%E7%9A%84%E5%8F%98%E8%B0%83
|
||||
* @value true:开启
|
||||
* @value false:不开启
|
||||
*/
|
||||
toneSandhi?: boolean;
|
||||
/**
|
||||
* @description 是否保留 <rp>(</rp> 标签,默认为保留
|
||||
* @value true:保留 <rp>(</rp>
|
||||
* @value false:移除 <rp>(</rp>
|
||||
*/
|
||||
rp?: boolean;
|
||||
/**
|
||||
* @description 对于 ü 的返回是否转换成 v(仅在 toneType: none 启用时生效)
|
||||
* @value false:返回值中保留 ü (默认值)
|
||||
* @value true:返回值中 ü 转换成 v
|
||||
* @value string:返回值中 ü 转换成指定字符
|
||||
*/
|
||||
v?: boolean | string;
|
||||
}
|
||||
|
||||
const DefaultHtmlOptions: HtmlOptions = {
|
||||
@@ -63,6 +52,7 @@ const DefaultHtmlOptions: HtmlOptions = {
|
||||
toneSandhi: true,
|
||||
rp: true,
|
||||
v: false,
|
||||
traditional: false,
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -78,9 +68,7 @@ export const html = (text: string, options?: HtmlOptions) => {
|
||||
} as Required<HtmlOptions>;
|
||||
const pinyinArray = pinyin(text, {
|
||||
type: "all",
|
||||
toneType: completeOptions.toneType,
|
||||
toneSandhi: options?.toneSandhi,
|
||||
v: completeOptions.v,
|
||||
...completeOptions,
|
||||
});
|
||||
const result = pinyinArray.map((item) => {
|
||||
let additionalClass = "";
|
||||
|
||||
@@ -18,6 +18,7 @@ import {
|
||||
} from "../../common/segmentit";
|
||||
import { Priority } from "@/common/constant";
|
||||
import { splitString } from "@/common/utils";
|
||||
import { getTraditionalDict } from "../traditional";
|
||||
|
||||
/**
|
||||
* @description: 获取单个字符的拼音
|
||||
@@ -31,13 +32,30 @@ export const getSingleWordPinyin: GetSingleWordPinyin = (char) => {
|
||||
return pinyin ? pinyin.split(" ")[0] : char;
|
||||
};
|
||||
|
||||
/**
 * Rewrites `word` one character at a time using the lookup table returned by
 * `getTraditionalDict()`.
 *
 * A character whose Unicode code point has a truthy entry in the table is
 * replaced by that entry; every other character is copied through unchanged.
 *
 * The string is walked by code point rather than by UTF-16 code unit. A
 * surrogate pair (emoji, CJK extension characters, ...) is therefore looked up
 * and copied as one character, and can never be split into a replaced high
 * surrogate followed by a dangling low surrogate.
 *
 * @param word Text to convert.
 * @returns The converted text; an empty string yields an empty string.
 */
const getTraditionalWords = (word: string): string => {
  const traditionalDict = getTraditionalDict();
  const converted: string[] = [];
  for (let i = 0; i < word.length; ) {
    let code = word.charCodeAt(i);
    let size = 1;
    // Combine a well-formed surrogate pair into a single code point. A lone
    // surrogate is left as-is and handled like any other single code unit.
    if (code >= 0xd800 && code <= 0xdbff && i + 1 < word.length) {
      const low = word.charCodeAt(i + 1);
      if (low >= 0xdc00 && low <= 0xdfff) {
        code = (code - 0xd800) * 0x400 + (low - 0xdc00) + 0x10000;
        size = 2;
      }
    }
    // Same truthiness rule as before: a missing or empty entry keeps the
    // original character.
    converted.push(traditionalDict[code] || word.slice(i, i + size));
    i += size;
  }
  return converted.join("");
};
|
||||
|
||||
export const getPinyin = (
|
||||
word: string,
|
||||
list: SingleWordResult[],
|
||||
surname: SurnameMode,
|
||||
segmentit: TokenizationAlgorithm
|
||||
segmentit: TokenizationAlgorithm,
|
||||
traditional?: boolean,
|
||||
): { list: SingleWordResult[]; matches: MatchPattern[] } => {
|
||||
const matches = acTree.search(word, surname, segmentit);
|
||||
const searchWord = traditional ? getTraditionalWords(word) : word;
|
||||
const matches = acTree.search(searchWord, surname, segmentit);
|
||||
let matchIndex = 0;
|
||||
const zhChars = splitString(word);
|
||||
for (let i = 0; i < zhChars.length; ) {
|
||||
@@ -45,6 +63,7 @@ export const getPinyin = (
|
||||
if (match && i === match.index) {
|
||||
if (match.length === 1 && match.priority <= Priority.Normal) {
|
||||
const char = zhChars[i];
|
||||
match.zh = char;
|
||||
let pinyin: string = "";
|
||||
pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]);
|
||||
list[i] = {
|
||||
@@ -59,10 +78,12 @@ export const getPinyin = (
|
||||
}
|
||||
const pinyins = match.pinyin.split(" ");
|
||||
let pinyinIndex = 0;
|
||||
if (traditional) {
|
||||
match.zh = zhChars.slice(match.index, match.index + match.length).join("");
|
||||
}
|
||||
for (let j = 0; j < match.length; j++) {
|
||||
const zhChars = splitString(match.zh);
|
||||
list[i + j] = {
|
||||
origin: zhChars[j],
|
||||
origin: zhChars[j + match.index],
|
||||
result: pinyins[pinyinIndex] || "",
|
||||
isZh: true,
|
||||
originPinyin: pinyins[pinyinIndex] || "",
|
||||
@@ -121,7 +142,7 @@ export const getAllPinyin: GetAllPinyin = (char, surname = "off") => {
|
||||
const surnamePinyin = Surnames[char];
|
||||
if (surnamePinyin) {
|
||||
pinyin = [surnamePinyin].concat(
|
||||
pinyin.filter((py) => py !== surnamePinyin)
|
||||
pinyin.filter((py) => py !== surnamePinyin),
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -135,7 +156,7 @@ export const getAllPinyin: GetAllPinyin = (char, surname = "off") => {
|
||||
*/
|
||||
type GetMultiplePinyin = (
|
||||
word: string,
|
||||
surname?: SurnameMode
|
||||
surname?: SurnameMode,
|
||||
) => SingleWordResult[];
|
||||
const getMultiplePinyin: GetMultiplePinyin = (word, surname = "off") => {
|
||||
let pinyin = getAllPinyin(word, surname);
|
||||
@@ -166,7 +187,7 @@ const getMultiplePinyin: GetMultiplePinyin = (word, surname = "off") => {
|
||||
*/
|
||||
type GetInitialAndFinal = (
|
||||
pinyin: string,
|
||||
initialPattern?: InitialPattern
|
||||
initialPattern?: InitialPattern,
|
||||
) => {
|
||||
final: string;
|
||||
initial: string;
|
||||
|
||||
@@ -163,7 +163,7 @@ function pinyin(word: string, options?: OptionsReturnAll): AllData[];
|
||||
*/
|
||||
function pinyin(
|
||||
word: string,
|
||||
options?: CompleteOptions
|
||||
options?: CompleteOptions,
|
||||
): string | string[] | AllData[] {
|
||||
options = { ...DEFAULT_OPTIONS, ...(options || {}) };
|
||||
// 校验 word 类型是否正确
|
||||
@@ -203,7 +203,8 @@ function pinyin(
|
||||
word,
|
||||
_list,
|
||||
options.surname as SurnameMode,
|
||||
options.segmentit as TokenizationAlgorithm
|
||||
options.segmentit as TokenizationAlgorithm,
|
||||
options.traditional,
|
||||
);
|
||||
|
||||
// 一和不变调处理
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { BasicOptions } from "../pinyin";
|
||||
import type { BasicOptions } from "../pinyin";
|
||||
import { TokenizationAlgorithm } from "../../common/segmentit";
|
||||
import { stringLength } from "@/common/utils";
|
||||
import { middleWareNonZh, middlewareToneSandhi, middlewareToneType, middlewareV, validateType } from "@/core/pinyin/middlewares";
|
||||
@@ -8,7 +8,7 @@ import { middlewareOutputFormat, middlewareSegment, Output, OutputFormat } from
|
||||
|
||||
type SegmentBaseOptions = Pick<
|
||||
BasicOptions,
|
||||
"toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit"
|
||||
"toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit" | "traditional"
|
||||
>;
|
||||
|
||||
interface AllSegmentReturnOptions extends SegmentBaseOptions {
|
||||
@@ -103,6 +103,7 @@ const DEFAULT_OPTIONS: SegmentCompleteOptions = {
|
||||
toneSandhi: true,
|
||||
segmentit: TokenizationAlgorithm.MaxProbability,
|
||||
format: OutputFormat.AllSegment,
|
||||
traditional: false,
|
||||
};
|
||||
|
||||
export function segment(word: string, options?: AllSegmentReturnOptions): Output['AllSegment'];
|
||||
@@ -138,7 +139,8 @@ export function segment(word: string, options?: SegmentCompleteOptions) {
|
||||
word,
|
||||
_list,
|
||||
options.surname as SurnameMode,
|
||||
options.segmentit as TokenizationAlgorithm
|
||||
options.segmentit as TokenizationAlgorithm,
|
||||
options.traditional as boolean
|
||||
);
|
||||
|
||||
// 一和不变调处理
|
||||
|
||||
13
lib/core/traditional/index.ts
Normal file
13
lib/core/traditional/index.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
/**
 * Sparse lookup table shared by the traditional-character mode. The index is
 * the Unicode code point of a source character; the value is the string
 * registered for it through `addTraditionalDict`.
 */
const traditionalDict: string[] = [];

/**
 * Returns the Unicode code point of the first character of `text`, combining a
 * leading surrogate pair into a single value.
 *
 * @returns The code point, or -1 when `text` is empty.
 */
function firstCodePoint(text: string): number {
  if (text.length === 0) {
    return -1;
  }
  const high = text.charCodeAt(0);
  if (high >= 0xd800 && high <= 0xdbff && text.length > 1) {
    const low = text.charCodeAt(1);
    if (low >= 0xdc00 && low <= 0xdfff) {
      return (high - 0xd800) * 0x400 + (low - 0xdc00) + 0x10000;
    }
  }
  return high;
}

/**
 * Registers character mappings in the shared traditional dictionary.
 *
 * Each own key of `dict` is filed under the code point of its first character
 * (any further characters in the key are ignored, as before), and a later
 * registration for the same character overwrites the earlier one.
 *
 * Keys outside the Basic Multilingual Plane are indexed by their full code
 * point instead of their high surrogate, so unrelated characters that merely
 * share a high surrogate no longer collide. Empty keys are skipped instead of
 * creating a stray `NaN` property on the table.
 *
 * @param dict Map from a source character to its replacement string.
 */
export function addTraditionalDict(dict: Record<string, string>): void {
  // Object.keys limits the walk to own enumerable keys; for...in would also
  // pick up enumerable properties inherited through the prototype chain.
  for (const key of Object.keys(dict)) {
    const code = firstCodePoint(key);
    if (code < 0) {
      continue;
    }
    traditionalDict[code] = dict[key];
  }
}

/**
 * Returns the shared traditional dictionary table.
 *
 * The live array is returned (not a copy), so entries added later through
 * `addTraditionalDict` are visible through a previously obtained reference.
 *
 * @returns Sparse array indexed by character code point.
 */
export function getTraditionalDict(): string[] {
  return traditionalDict;
}
|
||||
23
lib/index.ts
23
lib/index.ts
@@ -1,9 +1,14 @@
|
||||
export { getInitialAndFinal, getFinalParts, getNumOfTone } from './core/pinyin/handle';
|
||||
export { pinyin } from './core/pinyin';
|
||||
export { customPinyin, clearCustomDict } from './core/custom';
|
||||
export { addDict, removeDict } from './core/dict';
|
||||
export { match } from './core/match';
|
||||
export { html } from './core/html';
|
||||
export { polyphonic } from './core/polyphonic';
|
||||
export { convert } from './core/convert';
|
||||
export { segment, OutputFormat } from './core/segment';
|
||||
export {
|
||||
getInitialAndFinal,
|
||||
getFinalParts,
|
||||
getNumOfTone,
|
||||
} from "./core/pinyin/handle";
|
||||
export { pinyin } from "./core/pinyin";
|
||||
export { customPinyin, clearCustomDict } from "./core/custom";
|
||||
export { addDict, removeDict } from "./core/dict";
|
||||
export { match } from "./core/match";
|
||||
export { html } from "./core/html";
|
||||
export { polyphonic } from "./core/polyphonic";
|
||||
export { convert } from "./core/convert";
|
||||
export { segment, OutputFormat } from "./core/segment";
|
||||
export { addTraditionalDict, getTraditionalDict } from "./core/traditional";
|
||||
|
||||
@@ -54,7 +54,7 @@
|
||||
"devDependencies": {
|
||||
"@commitlint/cli": "^11.0.0",
|
||||
"@commitlint/config-conventional": "^11.0.0",
|
||||
"@pinyin-pro/data": "1.0.3",
|
||||
"@pinyin-pro/data": "1.3.0",
|
||||
"@rollup/plugin-commonjs": "^17.1.0",
|
||||
"@rollup/plugin-json": "^4.1.0",
|
||||
"@rollup/plugin-node-resolve": "^11.2.0",
|
||||
|
||||
69
test/traditional.test.js
Normal file
69
test/traditional.test.js
Normal file
@@ -0,0 +1,69 @@
|
||||
import { pinyin, html, addTraditionalDict, segment } from '../lib/index';
|
||||
import traditionalDict from '@pinyin-pro/data/traditional'
|
||||
import { expect, describe, it } from 'vitest';
|
||||
|
||||
describe("without traditional", () => {
|
||||
it("[pinyin traditional]轉盤", () => {
|
||||
const result = pinyin("轉盤");
|
||||
expect(result).to.be.equal("zhuǎn pán");
|
||||
});
|
||||
|
||||
it("[segment traditional]一个轉盤", () => {
|
||||
const result = segment("一个轉盤");
|
||||
expect(result).to.deep.equal([
|
||||
{
|
||||
"origin": "一",
|
||||
"result": "yí",
|
||||
},
|
||||
{
|
||||
"origin": "个",
|
||||
"result": "gè",
|
||||
},
|
||||
{
|
||||
"origin": "轉",
|
||||
"result": "zhuǎn",
|
||||
},
|
||||
{
|
||||
"origin": "盤",
|
||||
"result": "pán",
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it("[html traditional]轉盤", () => {
|
||||
const result = html("轉盤");
|
||||
expect(result).to.be.equal('<span class="py-result-item"><ruby><span class="py-chinese-item">轉</span><rp>(</rp><rt class="py-pinyin-item">zhuǎn</rt><rp>)</rp></ruby></span><span class="py-result-item"><ruby><span class="py-chinese-item">盤</span><rp>(</rp><rt class="py-pinyin-item">pán</rt><rp>)</rp></ruby></span>');
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
describe("with traditional", () => {
|
||||
addTraditionalDict(traditionalDict);
|
||||
it("[pinyin with traditional]一个🌛轉盤", () => {
|
||||
const result = pinyin("一个🌛轉盤", { traditional: true });
|
||||
expect(result).to.be.equal("yí gè 🌛 zhuàn pán");
|
||||
});
|
||||
|
||||
it("[segment traditional]一个轉盤", () => {
|
||||
const result = segment("一个轉盤", { traditional: true });
|
||||
expect(result).to.deep.equal([
|
||||
{
|
||||
"origin": "一",
|
||||
"result": "yí",
|
||||
},
|
||||
{
|
||||
"origin": "个",
|
||||
"result": "gè",
|
||||
},
|
||||
{
|
||||
"origin": "轉盤",
|
||||
"result": "zhuànpán",
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it("[html with traditional]轉盤", () => {
|
||||
const result = html("轉盤", { traditional: true });
|
||||
expect(result).to.be.equal('<span class="py-result-item"><ruby><span class="py-chinese-item">轉</span><rp>(</rp><rt class="py-pinyin-item">zhuàn</rt><rp>)</rp></ruby></span><span class="py-result-item"><ruby><span class="py-chinese-item">盤</span><rp>(</rp><rt class="py-pinyin-item">pán</rt><rp>)</rp></ruby></span>');
|
||||
});
|
||||
});
|
||||
7
types/common/type.d.ts
vendored
7
types/common/type.d.ts
vendored
@@ -4,6 +4,7 @@ export interface SingleWordResult {
|
||||
result: string;
|
||||
isZh: boolean;
|
||||
delete?: boolean;
|
||||
traditional?: string;
|
||||
}
|
||||
export type ToneType = "symbol" | "num" | "none";
|
||||
export type PinyinMode = "normal" | "surname";
|
||||
@@ -59,4 +60,10 @@ export type CommonOptions = {
|
||||
* @value standard:不将 `y`、`w` 视为声母
|
||||
*/
|
||||
initialPattern?: InitialPattern;
|
||||
/**
|
||||
* @description 是否启用繁体字模式(可以更好地识别繁体字)
|
||||
* @value false:不启用繁体字模式 (默认值)
|
||||
* @value true:启用繁体字模式
|
||||
*/
|
||||
traditional?: boolean;
|
||||
};
|
||||
|
||||
21
types/core/html/index.d.ts
vendored
21
types/core/html/index.d.ts
vendored
@@ -1,4 +1,6 @@
|
||||
interface HtmlOptions {
|
||||
import type { BasicOptions } from "../../core/pinyin";
|
||||
type HtmlBaseOptions = Pick<BasicOptions, "toneType" | "v" | "toneSandhi" | "segmentit" | "traditional" | "surname" | "mode">;
|
||||
interface HtmlOptions extends HtmlBaseOptions {
|
||||
/**
|
||||
* @description html 结果中每个字+拼音外层 span 标签的类名。默认为 py-result-item
|
||||
*/
|
||||
@@ -19,35 +21,18 @@ interface HtmlOptions {
|
||||
* @description html 非汉字字符外层 span 标签的类名,仅当 wrapNonChinese 为 true 时生效。默认为 py-non-chinese-item
|
||||
*/
|
||||
nonChineseClass?: string;
|
||||
/**
|
||||
* @description 拼音上是否标注音调
|
||||
*/
|
||||
toneType?: "symbol" | "num" | "none";
|
||||
/**
|
||||
* @description 对于指定的汉字及字符,在 result 上额外补充的拼音
|
||||
*/
|
||||
customClassMap?: {
|
||||
[classname: string]: string[];
|
||||
};
|
||||
/**
|
||||
* @description 是否开启「一」和 「不」字的变调。默认开启。参考:https://zh.wiktionary.org/wiki/Appendix:%E2%80%9C%E4%B8%80%E2%80%9D%E5%8F%8A%E2%80%9C%E4%B8%8D%E2%80%9D%E7%9A%84%E5%8F%98%E8%B0%83
|
||||
* @value true:开启
|
||||
* @value false:不开启
|
||||
*/
|
||||
toneSandhi?: boolean;
|
||||
/**
|
||||
* @description 是否保留 <rp>(</rp> 标签,默认为保留
|
||||
* @value true:保留 <rp>(</rp>
|
||||
* @value false:移除 <rp>(</rp>
|
||||
*/
|
||||
rp?: boolean;
|
||||
/**
|
||||
* @description 对于 ü 的返回是否转换成 v(仅在 toneType: none 启用时生效)
|
||||
* @value false:返回值中保留 ü (默认值)
|
||||
* @value true:返回值中 ü 转换成 v
|
||||
* @value string:返回值中 ü 转换成指定字符
|
||||
*/
|
||||
v?: boolean | string;
|
||||
}
|
||||
/**
|
||||
* @description: 获取带拼音汉字的 html 字符串
|
||||
|
||||
2
types/core/pinyin/handle.d.ts
vendored
2
types/core/pinyin/handle.d.ts
vendored
@@ -8,7 +8,7 @@ import { MatchPattern, TokenizationAlgorithm } from "../../common/segmentit";
|
||||
*/
|
||||
type GetSingleWordPinyin = (char: string) => string;
|
||||
export declare const getSingleWordPinyin: GetSingleWordPinyin;
|
||||
export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm) => {
|
||||
export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm, traditional?: boolean) => {
|
||||
list: SingleWordResult[];
|
||||
matches: MatchPattern[];
|
||||
};
|
||||
|
||||
4
types/core/segment/index.d.ts
vendored
4
types/core/segment/index.d.ts
vendored
@@ -1,6 +1,6 @@
|
||||
import { BasicOptions } from "../pinyin";
|
||||
import type { BasicOptions } from "../pinyin";
|
||||
import { Output, OutputFormat } from "./middlewares";
|
||||
type SegmentBaseOptions = Pick<BasicOptions, "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit">;
|
||||
type SegmentBaseOptions = Pick<BasicOptions, "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit" | "traditional">;
|
||||
interface AllSegmentReturnOptions extends SegmentBaseOptions {
|
||||
/**
|
||||
* @description 以片段格式返回全部信息
|
||||
|
||||
2
types/core/traditional/index.d.ts
vendored
Normal file
2
types/core/traditional/index.d.ts
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
/**
 * Registers character mappings in the shared traditional dictionary.
 * Each key of `dict` is filed under a numeric code derived from its first
 * character; the associated value is stored as the replacement string.
 */
export declare function addTraditionalDict(dict: Record<string, string>): void;
|
||||
/**
 * Returns the shared traditional dictionary table populated by
 * `addTraditionalDict`: a sparse array indexed by character code.
 */
export declare function getTraditionalDict(): string[];
|
||||
2
types/data/traditional-to-simplified.d.ts
vendored
Normal file
2
types/data/traditional-to-simplified.d.ts
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
declare const TraditionalListMap: string[];
|
||||
export { TraditionalListMap };
|
||||
19
types/index.d.ts
vendored
19
types/index.d.ts
vendored
@@ -1,9 +1,10 @@
|
||||
export { getInitialAndFinal, getFinalParts, getNumOfTone } from './core/pinyin/handle';
|
||||
export { pinyin } from './core/pinyin';
|
||||
export { customPinyin, clearCustomDict } from './core/custom';
|
||||
export { addDict, removeDict } from './core/dict';
|
||||
export { match } from './core/match';
|
||||
export { html } from './core/html';
|
||||
export { polyphonic } from './core/polyphonic';
|
||||
export { convert } from './core/convert';
|
||||
export { segment, OutputFormat } from './core/segment';
|
||||
export { getInitialAndFinal, getFinalParts, getNumOfTone, } from "./core/pinyin/handle";
|
||||
export { pinyin } from "./core/pinyin";
|
||||
export { customPinyin, clearCustomDict } from "./core/custom";
|
||||
export { addDict, removeDict } from "./core/dict";
|
||||
export { match } from "./core/match";
|
||||
export { html } from "./core/html";
|
||||
export { polyphonic } from "./core/polyphonic";
|
||||
export { convert } from "./core/convert";
|
||||
export { segment, OutputFormat } from "./core/segment";
|
||||
export { addTraditionalDict, getTraditionalDict } from "./core/traditional";
|
||||
|
||||
Reference in New Issue
Block a user