mirror of
https://github.com/skishore/makemeahanzi.git
synced 2025-10-30 02:18:16 +08:00
Add high-recall phonetic-semantic inference
This commit is contained in:
@ -23,3 +23,4 @@ meteorhacks:npm
|
||||
npm-container
|
||||
less
|
||||
rubaxa:sortable
|
||||
underscorestring:underscore.string
|
||||
|
||||
@ -65,6 +65,7 @@ templating-tools@1.0.0
|
||||
tracker@1.0.9
|
||||
ui@1.0.8
|
||||
underscore@1.0.4
|
||||
underscorestring:underscore.string@3.2.2
|
||||
url@1.0.5
|
||||
webapp@1.2.2
|
||||
webapp-hashing@1.0.5
|
||||
|
||||
@ -98,7 +98,8 @@ const initializeRadical = (character, components) => {
|
||||
|
||||
const initializeEtymology = (glyph, components) => {
|
||||
const data = cjklib.getCharacterData(glyph.character);
|
||||
const target = pinyin_util.dropTones(glyph.pinyin || data.pinyin || '');
|
||||
const target = pinyin_util.dropTones(
|
||||
glyph.metadata.pinyin || data.pinyin || '');
|
||||
const phonetic_match = (component) => {
|
||||
const component_data = cjklib.getCharacterData(component);
|
||||
const attempt = pinyin_util.dropTones(component_data.pinyin || '');
|
||||
@ -116,6 +117,34 @@ const initializeEtymology = (glyph, components) => {
|
||||
return {type: 'ideographic'};
|
||||
}
|
||||
|
||||
// Methods for automatically inferring a phonetic-semantic decomposition.
|
||||
|
||||
// Converts tone-marked pinyin to numbered pinyin (via
// pinyin_util.tonePinyinToNumberedPinyin) and then repeats every lowercase
// ASCII letter twice. Any other character - e.g. a trailing tone digit - is
// kept exactly once.
//
// The result is compared with s.levenshtein in
// guessPhoneticAndSemanticComponents; presumably the doubling makes a
// difference in a letter cost more than a difference in tone.
//
//   pinyin: the pinyin string to transform.
//   returns: the numbered pinyin with each [a-z] character doubled.
const doubleAlphabeticCharacters = (pinyin) => {
  const numbered = pinyin_util.tonePinyinToNumberedPinyin(pinyin);
  return Array.from(numbered)
              .map((x) => /[a-z]/.test(x) ? x + x : x)
              .join('');
};
|
||||
|
||||
// Guesses which component of a glyph is its phonetic component and which is
// its semantic component, based on pinyin similarity.
//
// The glyph's pinyin is taken from glyph.metadata.pinyin, falling back to the
// cjklib character data and then to the empty string. Each component's pinyin
// is looked up through cjklib as well. Both sides are passed through
// doubleAlphabeticCharacters and compared with s.levenshtein; the component
// with the smallest distance is reported as the phonetic one.
//
//   glyph: object with a `character` field and a `metadata` object.
//   components: array of component characters.
//   returns: {} when there are no components; otherwise an object with a
//       `phonetic` key, plus a `semantic` key only when there are exactly two
//       components (the one that was not chosen as phonetic).
const guessPhoneticAndSemanticComponents = (glyph, components) => {
  const data = cjklib.getCharacterData(glyph.character);
  const target = doubleAlphabeticCharacters(
      glyph.metadata.pinyin || data.pinyin || '');

  // Edit distance between a component's pinyin and the glyph's pinyin.
  const distance = (component) => {
    const component_data = cjklib.getCharacterData(component);
    const attempt = doubleAlphabeticCharacters(component_data.pinyin || '');
    return s.levenshtein(attempt, target);
  };

  // Compute each distance once, then order the components closest-first.
  const pairs = components.map((x) => [x, distance(x)]);
  const sorted = pairs.sort((a, b) => a[1] - b[1]).map((x) => x[0]);

  const result = {};
  if (sorted.length > 0) {
    result.phonetic = sorted[0];
    // With more than two components there is no single leftover to call the
    // semantic component, so it is only filled in for exactly two.
    if (sorted.length === 2) {
      result.semantic = sorted[1];
    }
  }
  return result;
};
|
||||
|
||||
stages.analysis = class AnalysisStage extends stages.AbstractStage {
|
||||
constructor(glyph) {
|
||||
super('analysis');
|
||||
@ -185,6 +214,10 @@ Template.analysis_stage.events({
|
||||
delete stage.etymology.hint;
|
||||
}
|
||||
stage.etymology.type = type;
|
||||
if (type === 'pictophonetic') {
|
||||
_.extend(stage.etymology, guessPhoneticAndSemanticComponents(
|
||||
Session.get('editor.glyph'), collectComponents(stage.tree)));
|
||||
}
|
||||
stage.forceRefresh();
|
||||
},
|
||||
'change .subtree-type': function(event) {
|
||||
|
||||
@ -21,13 +21,16 @@ const vowels = tokenSet('a ai an ang ao e ei en eng er i ia ian iang iao ie ' +
|
||||
const two_syllables = tokenSet('ia ian iang iao ie io iong iu ua uai uan ' +
|
||||
'uang ue ui uo van');
|
||||
|
||||
pinyin_util.dropTones = (pinyin) => {
|
||||
pinyin_util.dropTones = (pinyin, append_number) => {
|
||||
for (let i = 0; i < pinyin.length; i++) {
|
||||
for (let option = 1; option <= 4; option++) {
|
||||
const index = vowel_to_tone[option].indexOf(pinyin[i]);
|
||||
if (index >= 0) {
|
||||
const toneless = 'aeiouv'[index];
|
||||
pinyin = pinyin.substr(0, i) + toneless + pinyin.substr(i + 1);
|
||||
if (append_number) {
|
||||
return `${pinyin}${option}`;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -72,3 +75,7 @@ pinyin_util.numberedPinyinToTonePinyin = (numbered) => {
|
||||
}
|
||||
return consonant + vowel.replace('v', 'ü');
|
||||
}
|
||||
|
||||
// Converts tone-marked pinyin to numbered pinyin by delegating to
// pinyin_util.dropTones with its append_number flag set.
//
//   tone: the tone-marked pinyin string.
//   returns: whatever dropTones returns in append_number mode.
// NOTE(review): the result for input with no tone mark depends on the tail of
// dropTones - confirm it is what callers expect.
pinyin_util.tonePinyinToNumberedPinyin = (tone) => {
  return pinyin_util.dropTones(tone, true /* append_number */);
};
|
||||
|
||||
Reference in New Issue
Block a user