Add high-recall phonetic-semantic inference

2025-10-30 02:18:16 +08:00 · 2015-11-07 02:53:04 -05:00
parent a646d34f3c
commit 57cf6ce948
4 changed files with 44 additions and 2 deletions
--- a/.meteor/packages
+++ b/.meteor/packages
@@ -23,3 +23,4 @@ meteorhacks:npm
 npm-container
 less
 rubaxa:sortable
+underscorestring:underscore.string
--- a/.meteor/versions
+++ b/.meteor/versions
@@ -65,6 +65,7 @@ templating-tools@1.0.0
 tracker@1.0.9
 ui@1.0.8
 underscore@1.0.4
+underscorestring:underscore.string@3.2.2
 url@1.0.5
 webapp@1.2.2
 webapp-hashing@1.0.5
--- a/client/lib/analysis.js
+++ b/client/lib/analysis.js
@@ -98,7 +98,8 @@ const initializeRadical = (character, components) => {

 const initializeEtymology = (glyph, components) => {
  const data = cjklib.getCharacterData(glyph.character);
-  const target = pinyin_util.dropTones(glyph.pinyin || data.pinyin || '');
+  const target = pinyin_util.dropTones(
+      glyph.metadata.pinyin || data.pinyin || '');
  const phonetic_match = (component) => {
    const component_data = cjklib.getCharacterData(component);
    const attempt = pinyin_util.dropTones(component_data.pinyin || '');
@@ -116,6 +117,34 @@ const initializeEtymology = (glyph, components) => {
  return {type: 'ideographic'};
 }

+// Methods for automatically inferring a phonetic-semantic decomposition.
+
+// Doubles each a-z letter of the numbered pinyin; other characters stay single.
+const doubleAlphabeticCharacters = (pinyin) => Array.from(
+    pinyin_util.tonePinyinToNumberedPinyin(pinyin),
+    (ch) => /[a-z]/.test(ch) ? `${ch}${ch}` : ch).join('');
+
+// Ranks components by pinyin edit distance to the glyph's reading: the closest
+// becomes `phonetic`; with exactly two components the other is `semantic`.
+// Letters are doubled first, so a letter mismatch outweighs a tone-digit one.
+const guessPhoneticAndSemanticComponents = (glyph, components) => {
+  const glyph_data = cjklib.getCharacterData(glyph.character);
+  const reading = glyph.metadata.pinyin || glyph_data.pinyin || '';
+  const target = doubleAlphabeticCharacters(reading);
+  const scored = components.map((component) => {
+    const candidate = cjklib.getCharacterData(component).pinyin || '';
+    const score = s.levenshtein(doubleAlphabeticCharacters(candidate), target);
+    return {component, score};
+  });
+  scored.sort((left, right) => left.score - right.score);
+  const ranked = scored.map((entry) => entry.component);
+  if (ranked.length === 0) {
+    return {};
+  }
+  const [phonetic, semantic] = ranked;
+  return ranked.length === 2 ? {phonetic, semantic} : {phonetic};
+}
+
 stages.analysis = class AnalysisStage extends stages.AbstractStage {
  constructor(glyph) {
    super('analysis');
@@ -185,6 +214,10 @@ Template.analysis_stage.events({
      delete stage.etymology.hint;
    }
    stage.etymology.type = type;
+    if (type === 'pictophonetic') {
+      _.extend(stage.etymology, guessPhoneticAndSemanticComponents(
+          Session.get('editor.glyph'), collectComponents(stage.tree)));
+    }
    stage.forceRefresh();
  },
  'change .subtree-type': function(event) {
--- a/lib/pinyin_util.js
+++ b/lib/pinyin_util.js
@@ -21,13 +21,16 @@ const vowels = tokenSet('a ai an ang ao e ei en eng er i ia ian iang iao ie ' +
 const two_syllables = tokenSet('ia ian iang iao ie io iong iu ua uai uan ' +
                               'uang ue ui uo van');

-pinyin_util.dropTones = (pinyin) => {
+pinyin_util.dropTones = (pinyin, append_number) => {
  for (let i = 0; i < pinyin.length; i++) {
    for (let option = 1; option <= 4; option++) {
      const index = vowel_to_tone[option].indexOf(pinyin[i]);
      if (index >= 0) {
        const toneless = 'aeiouv'[index];
        pinyin = pinyin.substr(0, i) + toneless + pinyin.substr(i + 1);
+        if (append_number) {
+          return `${pinyin}${option}`;
+        }
      }
    }
  }
@@ -72,3 +75,7 @@ pinyin_util.numberedPinyinToTonePinyin = (numbered) => {
  }
  return consonant + vowel.replace('v', 'ü');
 }
+
+// Numbered form of tone-marked pinyin: dropTones with append_number enabled.
+pinyin_util.tonePinyinToNumberedPinyin = (tone) =>
+    pinyin_util.dropTones(tone, true /* append_number */);