# -*- coding: utf-8 -*-
"""Print summary statistics about a character dictionary.

The dictionary is read from a file holding one JSON object per line; each
object is stored under the value of its 'character' key.
"""
import io
import json

# Ideographic description character that opens a left-to-right decomposition.
LEFT_TO_RIGHT = u'⿰'


def run_left_right_analysis(characters):
    """Count where the phonetic component sits in left-right characters.

    Only entries that have both 'decomposition' and 'etymology', whose
    etymology type is 'pictophonetic', and whose decomposition is exactly
    the left-to-right marker followed by two components are considered.

    Args:
        characters: mapping from character to its dictionary entry (a dict).

    Returns:
        The tuple ``(total, left, right)`` that is also printed: the number
        of entries considered, and how many of them have their phonetic
        component equal to the first / second decomposition component.
    """
    total = 0
    left = 0
    right = 0
    for data in characters.values():
        if 'decomposition' not in data or 'etymology' not in data:
            continue
        decomposition = data['decomposition']
        etymology = data['etymology']
        if etymology.get('type') != 'pictophonetic':
            continue
        # Check the length first so an empty decomposition cannot raise
        # IndexError on the marker lookup below.
        if len(decomposition) != 3 or decomposition[0] != LEFT_TO_RIGHT:
            continue
        total += 1
        phonetic = etymology.get('phonetic')
        # Two independent tests: when both components are the same
        # character, the entry counts for both sides.
        if phonetic == decomposition[1]:
            left += 1
        if phonetic == decomposition[2]:
            right += 1
    result = (total, left, right)
    print('(total, left, right): %s' % (result,))
    return result


def run_stroke_count_analysis(characters):
    """Compare stroke counts of phonetic and semantic components.

    An entry is considered when its etymology names both a 'phonetic' and a
    'semantic' component and both are themselves keys of ``characters``.
    The length of a component's 'matches' value is used as its stroke count.

    Args:
        characters: mapping from character to its dictionary entry (a dict).

    Returns:
        The tuple ``(total, fractions, phonetic_mean, semantic_mean)`` that
        is also printed. ``fractions`` is a three-element list holding the
        share of entries whose phonetic component has fewer, equally many,
        or more strokes than the semantic one. All averages are 0.0 when no
        entry qualifies.
    """
    total = 0
    counts = [0, 0, 0]
    phonetic_stroke_total = 0
    semantic_stroke_total = 0
    for data in characters.values():
        etymology = data.get('etymology', {})
        if 'phonetic' not in etymology or 'semantic' not in etymology:
            continue
        phonetic = etymology['phonetic']
        semantic = etymology['semantic']
        if phonetic not in characters or semantic not in characters:
            continue
        total += 1
        phonetic_strokes = len(characters[phonetic]['matches'])
        semantic_strokes = len(characters[semantic]['matches'])
        phonetic_stroke_total += phonetic_strokes
        semantic_stroke_total += semantic_strokes
        # Portable replacement for cmp(): -1, 0 or 1, shifted to index 0..2.
        sign = ((phonetic_strokes > semantic_strokes)
                - (phonetic_strokes < semantic_strokes))
        counts[sign + 1] += 1

    def mean(value):
        # Guard against an empty sample instead of dividing by zero.
        return 1.0 * value / total if total else 0.0

    result = (
        total,
        [mean(count) for count in counts],
        mean(phonetic_stroke_total),
        mean(semantic_stroke_total),
    )
    print('(total, counts, phonetic_mean, semantic_mean): %s' % (result,))
    return result


def parse_characters(lines):
    """Build the character mapping from an iterable of JSON lines.

    Lines that are empty after stripping whitespace are skipped. When two
    lines share the same 'character' value, the later one wins.

    Args:
        lines: iterable of strings, each holding one JSON object.

    Returns:
        Dict mapping each object's 'character' value to the decoded object.
    """
    characters = {}
    for line in lines:
        line = line.strip()
        # Lines read from a file keep their newline, so strip before testing.
        if not line:
            continue
        data = json.loads(line)
        characters[data['character']] = data
    return characters


def main(path='dictionary.txt'):
    """Load the dictionary at ``path`` and print both analyses."""
    with io.open(path, encoding='utf-8') as f:
        characters = parse_characters(f)
    run_left_right_analysis(characters)
    run_stroke_count_analysis(characters)


if __name__ == '__main__':
    main()