diff --git a/lib/common/type.ts b/lib/common/type.ts index 2163c24..e1ba167 100644 --- a/lib/common/type.ts +++ b/lib/common/type.ts @@ -5,6 +5,7 @@ export interface SingleWordResult { result: string; isZh: boolean; delete?: boolean; + traditional?: string; } // toneType 属性可选参数 @@ -72,4 +73,10 @@ export type CommonOptions = { * @value standard:不将 `y`、`w` 视为声母 */ initialPattern?: InitialPattern; + /** + * @description 是否启用繁体字模式(可以更好地识别繁体字) + * @value false:不启用繁体字模式 (默认值) + * @value true:启用繁体字模式 + */ + traditional?: boolean; }; diff --git a/lib/core/html/index.ts b/lib/core/html/index.ts index d935645..464ef81 100644 --- a/lib/core/html/index.ts +++ b/lib/core/html/index.ts @@ -1,6 +1,12 @@ import { pinyin } from "@/core/pinyin"; +import type { BasicOptions } from "../../core/pinyin"; -interface HtmlOptions { +type HtmlBaseOptions = Pick< + BasicOptions, + "toneType" | "v" | "toneSandhi" | "segmentit" | "traditional" | "surname" | "mode" +>; + +interface HtmlOptions extends HtmlBaseOptions { /** * @description html 结果中每个字+拼音外层 span 标签的类名。默认为 py-result-item */ @@ -21,35 +27,18 @@ interface HtmlOptions { * @description html 非汉字字符外层 span 标签的类名,仅当 wrapNonChinese 为 true 时生效。默认为 py-non-chinese-item */ nonChineseClass?: string; - /** - * @description 拼音上是否标注音调 - */ - toneType?: "symbol" | "num" | "none"; /** * @description 对于指定的汉字及字符,在 result 上额外补充的拼音 */ customClassMap?: { [classname: string]: string[]; }; - /** - * @description 是否开启「一」和 「不」字的变调。默认开启。参考:https://zh.wiktionary.org/wiki/Appendix:%E2%80%9C%E4%B8%80%E2%80%9D%E5%8F%8A%E2%80%9C%E4%B8%8D%E2%80%9D%E7%9A%84%E5%8F%98%E8%B0%83 - * @value true:开启 - * @value false:不开启 - */ - toneSandhi?: boolean; /** * @description 是否保留 ( 标签,默认为保留 * @value true:保留 ( * @value false:移除 ( */ rp?: boolean; - /** - * @description 对于 ü 的返回是否转换成 v(仅在 toneType: none 启用时生效) - * @value false:返回值中保留 ü (默认值) - * @value true:返回值中 ü 转换成 v - * @value string:返回值中 ü 转换成指定字符 - */ - v?: boolean | string; } const DefaultHtmlOptions: HtmlOptions = { @@ -63,6 +52,7 @@ const DefaultHtmlOptions: HtmlOptions = { toneSandhi: true, rp: true, v: false, + traditional: false, }; /** @@ -78,9 +68,7 @@ export const html = (text: string, options?: HtmlOptions) => { } as Required; const pinyinArray = pinyin(text, { type: "all", - toneType: completeOptions.toneType, - toneSandhi: options?.toneSandhi, - v: completeOptions.v, + ...completeOptions, }); const result = pinyinArray.map((item) => { let additionalClass = ""; diff --git a/lib/core/pinyin/handle.ts b/lib/core/pinyin/handle.ts index 7a0e3a3..58fe23a 100644 --- a/lib/core/pinyin/handle.ts +++ b/lib/core/pinyin/handle.ts @@ -18,6 +18,7 @@ import { } from "../../common/segmentit"; import { Priority } from "@/common/constant"; import { splitString } from "@/common/utils"; +import { getTraditionalDict } from "../traditional"; /** * @description: 获取单个字符的拼音 @@ -31,13 +32,30 @@ export const getSingleWordPinyin: GetSingleWordPinyin = (char) => { return pinyin ? pinyin.split(" ")[0] : char; }; +const getTraditionalWords = (word: string): string => { + const traditionalWords: string[] = []; + const traditionalDict = getTraditionalDict(); + for (let i = 0; i < word.length; i++) { + const key = word[i]; + const code = key.charCodeAt(0); + if (traditionalDict[code]) { + traditionalWords[i] = traditionalDict[code]; + } else { + traditionalWords[i] = key; + } + } + return traditionalWords.join(""); +}; + export const getPinyin = ( word: string, list: SingleWordResult[], surname: SurnameMode, - segmentit: TokenizationAlgorithm + segmentit: TokenizationAlgorithm, + traditional?: boolean, ): { list: SingleWordResult[]; matches: MatchPattern[] } => { - const matches = acTree.search(word, surname, segmentit); + const searchWord = traditional ? getTraditionalWords(word) : word; + const matches = acTree.search(searchWord, surname, segmentit); let matchIndex = 0; const zhChars = splitString(word); for (let i = 0; i < zhChars.length; ) { @@ -45,6 +63,7 @@ export const getPinyin = ( if (match && i === match.index) { if (match.length === 1 && match.priority <= Priority.Normal) { const char = zhChars[i]; + match.zh = char; let pinyin: string = ""; pinyin = processSepecialPinyin(char, zhChars[i - 1], zhChars[i + 1]); list[i] = { @@ -59,10 +78,12 @@ export const getPinyin = ( } const pinyins = match.pinyin.split(" "); let pinyinIndex = 0; + if (traditional) { + match.zh = zhChars.slice(match.index, match.index + match.length).join(""); + } for (let j = 0; j < match.length; j++) { - const zhChars = splitString(match.zh); list[i + j] = { - origin: zhChars[j], + origin: zhChars[j + match.index], result: pinyins[pinyinIndex] || "", isZh: true, originPinyin: pinyins[pinyinIndex] || "", @@ -121,7 +142,7 @@ export const getAllPinyin: GetAllPinyin = (char, surname = "off") => { const surnamePinyin = Surnames[char]; if (surnamePinyin) { pinyin = [surnamePinyin].concat( - pinyin.filter((py) => py !== surnamePinyin) + pinyin.filter((py) => py !== surnamePinyin), ); } } @@ -135,7 +156,7 @@ export const getAllPinyin: GetAllPinyin = (char, surname = "off") => { */ type GetMultiplePinyin = ( word: string, - surname?: SurnameMode + surname?: SurnameMode, ) => SingleWordResult[]; const getMultiplePinyin: GetMultiplePinyin = (word, surname = "off") => { let pinyin = getAllPinyin(word, surname); @@ -166,7 +187,7 @@ const getMultiplePinyin: GetMultiplePinyin = (word, surname = "off") => { */ type GetInitialAndFinal = ( pinyin: string, - initialPattern?: InitialPattern + initialPattern?: InitialPattern, ) => { final: string; initial: string; diff --git a/lib/core/pinyin/index.ts b/lib/core/pinyin/index.ts index dc630c5..062716c 100644 --- a/lib/core/pinyin/index.ts +++ b/lib/core/pinyin/index.ts @@ -163,7 +163,7 @@ function pinyin(word: string, options?: OptionsReturnAll): AllData[]; */ function pinyin( word: string, - options?: CompleteOptions + options?: CompleteOptions, ): string | string[] | AllData[] { options = { ...DEFAULT_OPTIONS, ...(options || {}) }; // 校验 word 类型是否正确 @@ -203,7 +203,8 @@ function pinyin( word, _list, options.surname as SurnameMode, - options.segmentit as TokenizationAlgorithm + options.segmentit as TokenizationAlgorithm, + options.traditional, ); // 一和不变调处理 diff --git a/lib/core/segment/index.ts b/lib/core/segment/index.ts index 93122d2..e40a8b0 100644 --- a/lib/core/segment/index.ts +++ b/lib/core/segment/index.ts @@ -1,4 +1,4 @@ -import { BasicOptions } from "../pinyin"; +import type { BasicOptions } from "../pinyin"; import { TokenizationAlgorithm } from "../../common/segmentit"; import { stringLength } from "@/common/utils"; import { middleWareNonZh, middlewareToneSandhi, middlewareToneType, middlewareV, validateType } from "@/core/pinyin/middlewares"; @@ -8,7 +8,7 @@ import { middlewareOutputFormat, middlewareSegment, Output, OutputFormat } from type SegmentBaseOptions = Pick< BasicOptions, - "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit" + "toneType" | "mode" | "surname" | "nonZh" | "v" | "toneSandhi" | "segmentit" | "traditional" >; interface AllSegmentReturnOptions extends SegmentBaseOptions { @@ -103,6 +103,7 @@ const DEFAULT_OPTIONS: SegmentCompleteOptions = { toneSandhi: true, segmentit: TokenizationAlgorithm.MaxProbability, format: OutputFormat.AllSegment, + traditional: false, }; export function segment(word: string, options?: AllSegmentReturnOptions): Output['AllSegment']; @@ -138,7 +139,8 @@ export function segment(word: string, options?: SegmentCompleteOptions) { word, _list, options.surname as SurnameMode, - options.segmentit as TokenizationAlgorithm + options.segmentit as TokenizationAlgorithm, + options.traditional as boolean ); // 一和不变调处理 diff --git a/lib/core/traditional/index.ts b/lib/core/traditional/index.ts new file mode 100644 index 0000000..ab4fdcb --- /dev/null +++ b/lib/core/traditional/index.ts @@ -0,0 +1,13 @@ +const traditionalDict: string[] = []; + +export function addTraditionalDict(dict: Record) { + for (let key in dict) { + const value = dict[key]; + const code = key.charCodeAt(0); + traditionalDict[code] = value; + } +} + +export function getTraditionalDict() { + return traditionalDict; +} diff --git a/lib/index.ts b/lib/index.ts index db88293..65541c2 100644 --- a/lib/index.ts +++ b/lib/index.ts @@ -1,9 +1,14 @@ -export { getInitialAndFinal, getFinalParts, getNumOfTone } from './core/pinyin/handle'; -export { pinyin } from './core/pinyin'; -export { customPinyin, clearCustomDict } from './core/custom'; -export { addDict, removeDict } from './core/dict'; -export { match } from './core/match'; -export { html } from './core/html'; -export { polyphonic } from './core/polyphonic'; -export { convert } from './core/convert'; -export { segment, OutputFormat } from './core/segment'; \ No newline at end of file +export { + getInitialAndFinal, + getFinalParts, + getNumOfTone, +} from "./core/pinyin/handle"; +export { pinyin } from "./core/pinyin"; +export { customPinyin, clearCustomDict } from "./core/custom"; +export { addDict, removeDict } from "./core/dict"; +export { match } from "./core/match"; +export { html } from "./core/html"; +export { polyphonic } from "./core/polyphonic"; +export { convert } from "./core/convert"; +export { segment, OutputFormat } from "./core/segment"; +export { addTraditionalDict, getTraditionalDict } from "./core/traditional"; diff --git a/package.json b/package.json index 0ce9051..a21fdc4 100644 --- a/package.json +++ b/package.json @@ -54,7 +54,7 @@ "devDependencies": { "@commitlint/cli": "^11.0.0", "@commitlint/config-conventional": "^11.0.0", - "@pinyin-pro/data": "1.0.3", + "@pinyin-pro/data": "1.3.0", "@rollup/plugin-commonjs": "^17.1.0", "@rollup/plugin-json": "^4.1.0", "@rollup/plugin-node-resolve": "^11.2.0", diff --git a/test/traditional.test.js b/test/traditional.test.js new file mode 100644 index 0000000..44177e9 --- /dev/null +++ b/test/traditional.test.js @@ -0,0 +1,69 @@ +import { pinyin, html, addTraditionalDict, segment } from '../lib/index'; +import traditionalDict from '@pinyin-pro/data/traditional' +import { expect, describe, it } from 'vitest'; + +describe("without traditional", () => { + it("[pinyin traditional]轉盤", () => { + const result = pinyin("轉盤"); + expect(result).to.be.equal("zhuǎn pán"); + }); + + it("[segment traditional]一个轉盤", () => { + const result = segment("一个轉盤"); + expect(result).to.deep.equal([ + { + "origin": "一", + "result": "yí", + }, + { + "origin": "个", + "result": "gè", + }, + { + "origin": "轉", + "result": "zhuǎn", + }, + { + "origin": "盤", + "result": "pán", + }, + ]); + }); + + it("[html traditional]轉盤", () => { + const result = html("轉盤"); + expect(result).to.be.equal('(zhuǎn)(pán)'); + }); +}); + + +describe("with traditional", () => { + addTraditionalDict(traditionalDict); + it("[pinyin with traditional]一个🌛轉盤", () => { + const result = pinyin("一个🌛轉盤", { traditional: true }); + expect(result).to.be.equal("yí gè 🌛 zhuàn pán"); + }); + + it("[segment traditional]一个轉盤", () => { + const result = segment("一个轉盤", { traditional: true }); + expect(result).to.deep.equal([ + { + "origin": "一", + "result": "yí", + }, + { + "origin": "个", + "result": "gè", + }, + { + "origin": "轉盤", + "result": "zhuànpán", + }, + ]); + }); + + it("[html with traditional]轉盤", () => { + const result = html("轉盤", { traditional: true }); + expect(result).to.be.equal('(zhuàn)(pán)'); + }); +}); diff --git a/types/common/type.d.ts b/types/common/type.d.ts index 12740aa..6906f7c 100644 --- a/types/common/type.d.ts +++ b/types/common/type.d.ts @@ -4,6 +4,7 @@ export interface SingleWordResult { result: string; isZh: boolean; delete?: boolean; + traditional?: string; } export type ToneType = "symbol" | "num" | "none"; export type PinyinMode = "normal" | "surname"; @@ -59,4 +60,10 @@ export type CommonOptions = { * @value standard:不将 `y`、`w` 视为声母 */ initialPattern?: InitialPattern; + /** + * @description 是否启用繁体字模式(可以更好地识别繁体字) + * @value false:不启用繁体字模式 (默认值) + * @value true:启用繁体字模式 + */ + traditional?: boolean; }; diff --git a/types/core/html/index.d.ts b/types/core/html/index.d.ts index ffbfddf..22f7126 100644 --- a/types/core/html/index.d.ts +++ b/types/core/html/index.d.ts @@ -1,4 +1,6 @@ -interface HtmlOptions { +import type { BasicOptions } from "../../core/pinyin"; +type HtmlBaseOptions = Pick; +interface HtmlOptions extends HtmlBaseOptions { /** * @description html 结果中每个字+拼音外层 span 标签的类名。默认为 py-result-item */ @@ -19,35 +21,18 @@ interface HtmlOptions { * @description html 非汉字字符外层 span 标签的类名,仅当 wrapNonChinese 为 true 时生效。默认为 py-non-chinese-item */ nonChineseClass?: string; - /** - * @description 拼音上是否标注音调 - */ - toneType?: "symbol" | "num" | "none"; /** * @description 对于指定的汉字及字符,在 result 上额外补充的拼音 */ customClassMap?: { [classname: string]: string[]; }; - /** - * @description 是否开启「一」和 「不」字的变调。默认开启。参考:https://zh.wiktionary.org/wiki/Appendix:%E2%80%9C%E4%B8%80%E2%80%9D%E5%8F%8A%E2%80%9C%E4%B8%8D%E2%80%9D%E7%9A%84%E5%8F%98%E8%B0%83 - * @value true:开启 - * @value false:不开启 - */ - toneSandhi?: boolean; /** * @description 是否保留 ( 标签,默认为保留 * @value true:保留 ( * @value false:移除 ( */ rp?: boolean; - /** - * @description 对于 ü 的返回是否转换成 v(仅在 toneType: none 启用时生效) - * @value false:返回值中保留 ü (默认值) - * @value true:返回值中 ü 转换成 v - * @value string:返回值中 ü 转换成指定字符 - */ - v?: boolean | string; } /** * @description: 获取带拼音汉字的 html 字符串 diff --git a/types/core/pinyin/handle.d.ts b/types/core/pinyin/handle.d.ts index c1f4b9d..5da30b1 100644 --- a/types/core/pinyin/handle.d.ts +++ b/types/core/pinyin/handle.d.ts @@ -8,7 +8,7 @@ import { MatchPattern, TokenizationAlgorithm } from "../../common/segmentit"; */ type GetSingleWordPinyin = (char: string) => string; export declare const getSingleWordPinyin: GetSingleWordPinyin; -export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm) => { +export declare const getPinyin: (word: string, list: SingleWordResult[], surname: SurnameMode, segmentit: TokenizationAlgorithm, traditional?: boolean) => { list: SingleWordResult[]; matches: MatchPattern[]; }; diff --git a/types/core/segment/index.d.ts b/types/core/segment/index.d.ts index e8f0524..9065b68 100644 --- a/types/core/segment/index.d.ts +++ b/types/core/segment/index.d.ts @@ -1,6 +1,6 @@ -import { BasicOptions } from "../pinyin"; +import type { BasicOptions } from "../pinyin"; import { Output, OutputFormat } from "./middlewares"; -type SegmentBaseOptions = Pick; +type SegmentBaseOptions = Pick; interface AllSegmentReturnOptions extends SegmentBaseOptions { /** * @description 以片段格式返回全部信息 diff --git a/types/core/traditional/index.d.ts b/types/core/traditional/index.d.ts new file mode 100644 index 0000000..e6f3fdc --- /dev/null +++ b/types/core/traditional/index.d.ts @@ -0,0 +1,2 @@ +export declare function addTraditionalDict(dict: Record): void; +export declare function getTraditionalDict(): string[]; diff --git a/types/data/traditional-to-simplified.d.ts b/types/data/traditional-to-simplified.d.ts new file mode 100644 index 0000000..5702477 --- /dev/null +++ b/types/data/traditional-to-simplified.d.ts @@ -0,0 +1,2 @@ +declare const TraditionalListMap: string[]; +export { TraditionalListMap }; diff --git a/types/index.d.ts b/types/index.d.ts index 51e49f7..adae192 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -1,9 +1,10 @@ -export { getInitialAndFinal, getFinalParts, getNumOfTone } from './core/pinyin/handle'; -export { pinyin } from './core/pinyin'; -export { customPinyin, clearCustomDict } from './core/custom'; -export { addDict, removeDict } from './core/dict'; -export { match } from './core/match'; -export { html } from './core/html'; -export { polyphonic } from './core/polyphonic'; -export { convert } from './core/convert'; -export { segment, OutputFormat } from './core/segment'; +export { getInitialAndFinal, getFinalParts, getNumOfTone, } from "./core/pinyin/handle"; +export { pinyin } from "./core/pinyin"; +export { customPinyin, clearCustomDict } from "./core/custom"; +export { addDict, removeDict } from "./core/dict"; +export { match } from "./core/match"; +export { html } from "./core/html"; +export { polyphonic } from "./core/polyphonic"; +export { convert } from "./core/convert"; +export { segment, OutputFormat } from "./core/segment"; +export { addTraditionalDict, getTraditionalDict } from "./core/traditional";