Add high-recall phonetic-semantic inference

This commit is contained in:
Shaunak Kishore
2015-11-07 02:53:04 -05:00
parent a646d34f3c
commit 57cf6ce948
4 changed files with 44 additions and 2 deletions

View File

@ -23,3 +23,4 @@ meteorhacks:npm
npm-container
less
rubaxa:sortable
underscorestring:underscore.string

View File

@ -65,6 +65,7 @@ templating-tools@1.0.0
tracker@1.0.9
ui@1.0.8
underscore@1.0.4
underscorestring:underscore.string@3.2.2
url@1.0.5
webapp@1.2.2
webapp-hashing@1.0.5

View File

@ -98,7 +98,8 @@ const initializeRadical = (character, components) => {
const initializeEtymology = (glyph, components) => {
const data = cjklib.getCharacterData(glyph.character);
const target = pinyin_util.dropTones(glyph.pinyin || data.pinyin || '');
const target = pinyin_util.dropTones(
glyph.metadata.pinyin || data.pinyin || '');
const phonetic_match = (component) => {
const component_data = cjklib.getCharacterData(component);
const attempt = pinyin_util.dropTones(component_data.pinyin || '');
@ -116,6 +117,34 @@ const initializeEtymology = (glyph, components) => {
return {type: 'ideographic'};
}
// Methods for automatically inferring a phonetic-semantic decomposition.
/**
 * Converts tone-marked pinyin to its numbered form and writes every lowercase
 * ASCII letter twice, leaving all other characters (such as the tone digit)
 * untouched.
 *
 * NOTE(review): the result is fed to an edit-distance comparison by
 * guessPhoneticAndSemanticComponents; presumably the doubling makes a letter
 * mismatch cost more than a tone mismatch - confirm with the author.
 *
 * @param {string} pinyin - Pinyin accepted by
 *     pinyin_util.tonePinyinToNumberedPinyin.
 * @returns {string} The numbered pinyin with each a-z character doubled.
 */
const doubleAlphabeticCharacters = (pinyin) => {
  const numbered = pinyin_util.tonePinyinToNumberedPinyin(pinyin);
  let doubled = '';
  for (const symbol of Array.from(numbered)) {
    doubled += /[a-z]/.test(symbol) ? symbol + symbol : symbol;
  }
  return doubled;
};
/**
 * Guesses which component of a glyph is its phonetic element by comparing
 * pronunciations.
 *
 * Each component's pinyin (from cjklib) is passed through
 * doubleAlphabeticCharacters and compared to the glyph's own pinyin using
 * s.levenshtein; the closest component is reported as the phonetic one.
 * A semantic component is only reported when there are exactly two
 * components, in which case it is the one that was not chosen as phonetic.
 *
 * @param {Object} glyph - Glyph record; reads glyph.character and
 *     glyph.metadata.pinyin (falling back to cjklib's pinyin, then '').
 * @param {Array} components - Component characters to rank.
 * @returns {Object} {} for no components, otherwise {phonetic} and,
 *     for exactly two components, {phonetic, semantic}.
 */
const guessPhoneticAndSemanticComponents = (glyph, components) => {
  const glyph_data = cjklib.getCharacterData(glyph.character);
  const target = doubleAlphabeticCharacters(
      glyph.metadata.pinyin || glyph_data.pinyin || '');

  // Pair every component with its edit distance from the glyph's reading.
  const ranked = components.map((component) => {
    const reading = cjklib.getCharacterData(component).pinyin || '';
    const attempt = doubleAlphabeticCharacters(reading);
    return [component, s.levenshtein(attempt, target)];
  });
  // Closest pronunciation first; `ranked` is a fresh array, so sorting it
  // in place does not disturb the caller's list.
  ranked.sort((a, b) => a[1] - b[1]);

  if (ranked.length === 0) {
    return {};
  }
  const guess = {phonetic: ranked[0][0]};
  if (ranked.length === 2) {
    guess.semantic = ranked[1][0];
  }
  return guess;
};
stages.analysis = class AnalysisStage extends stages.AbstractStage {
constructor(glyph) {
super('analysis');
@ -185,6 +214,10 @@ Template.analysis_stage.events({
delete stage.etymology.hint;
}
stage.etymology.type = type;
if (type === 'pictophonetic') {
_.extend(stage.etymology, guessPhoneticAndSemanticComponents(
Session.get('editor.glyph'), collectComponents(stage.tree)));
}
stage.forceRefresh();
},
'change .subtree-type': function(event) {

View File

@ -21,13 +21,16 @@ const vowels = tokenSet('a ai an ang ao e ei en eng er i ia ian iang iao ie ' +
const two_syllables = tokenSet('ia ian iang iao ie io iong iu ua uai uan ' +
'uang ue ui uo van');
pinyin_util.dropTones = (pinyin) => {
pinyin_util.dropTones = (pinyin, append_number) => {
for (let i = 0; i < pinyin.length; i++) {
for (let option = 1; option <= 4; option++) {
const index = vowel_to_tone[option].indexOf(pinyin[i]);
if (index >= 0) {
const toneless = 'aeiouv'[index];
pinyin = pinyin.substr(0, i) + toneless + pinyin.substr(i + 1);
if (append_number) {
return `${pinyin}${option}`;
}
}
}
}
@ -72,3 +75,7 @@ pinyin_util.numberedPinyinToTonePinyin = (numbered) => {
}
return consonant + vowel.replace('v', 'ü');
}
/**
 * Thin wrapper around pinyin_util.dropTones that always enables its
 * append_number option.
 *
 * @param {string} tone - Pinyin string to convert.
 * @returns Whatever pinyin_util.dropTones(tone, true) returns.
 */
pinyin_util.tonePinyinToNumberedPinyin = (tone) =>
    pinyin_util.dropTones(tone, /* append_number= */ true);