# -*- coding: utf-8 -*-
# Mirror of https://github.com/skishore/makemeahanzi.git
# (synced 2025-10-26 20:15:11 +08:00)
from __future__ import print_function

import json
def run_left_right_analysis(characters):
    """Report where the phonetic component sits in left-right characters.

    Scans every dictionary entry for pictophonetic characters whose
    decomposition is a left-right (⿰) split, then counts how often the
    phonetic component is the left vs. the right half.

    Prints a summary line: '(total, left, right): (T, L, R)'.

    characters: dict mapping a character to its dictionary entry — a dict
        with optional 'decomposition' (string) and 'etymology' (dict) keys.
    """
    total = 0
    right = 0
    left = 0
    # .values() replaces the Python-2-only .itervalues().
    for data in characters.values():
        if 'decomposition' not in data or 'etymology' not in data:
            continue
        (decomposition, etymology) = (data['decomposition'], data['etymology'])
        if etymology['type'] != 'pictophonetic':
            continue
        # A simple left-right decomposition is exactly three characters:
        # the ⿰ ideographic-description character plus the two components.
        if decomposition[0] != u'⿰' or len(decomposition) != 3:
            continue
        total += 1
        phonetic = etymology.get('phonetic')
        if phonetic == decomposition[1]:
            left += 1
        if phonetic == decomposition[2]:
            right += 1
    print('(total, left, right):', (total, left, right))
def run_stroke_count_analysis(characters):
    """Compare stroke counts of phonetic vs. semantic components.

    For every entry whose etymology names both a phonetic and a semantic
    component that are themselves dictionary entries, tallies whether the
    phonetic component has fewer, equal, or more strokes than the semantic
    one, and accumulates stroke totals for both.

    Prints: '(total, counts, phonetic_mean, semantic_mean): (...)' where
    counts is the fraction of characters in the fewer/equal/more buckets.

    characters: dict mapping a character to its dictionary entry — a dict
        with optional 'etymology' (dict) and 'matches' (list, one item per
        stroke) keys.
    """
    total = 0
    # counts[0]: phonetic has fewer strokes than semantic;
    # counts[1]: equal; counts[2]: phonetic has more.
    counts = [0, 0, 0]
    phonetic_stroke_total = 0
    semantic_stroke_total = 0
    # .values() replaces the Python-2-only .itervalues().
    for data in characters.values():
        etymology = data.get('etymology', {})
        if 'phonetic' not in etymology or 'semantic' not in etymology:
            continue
        (phonetic, semantic) = (etymology['phonetic'], etymology['semantic'])
        # Both components must have their own entries to get stroke counts.
        if phonetic not in characters or semantic not in characters:
            continue
        total += 1
        # The stroke count is the number of entries in 'matches'.
        phonetic_strokes = len(characters[phonetic]['matches'])
        semantic_strokes = len(characters[semantic]['matches'])
        phonetic_stroke_total += phonetic_strokes
        semantic_stroke_total += semantic_strokes
        # (a > b) - (a < b) is the Python-3-safe replacement for cmp(a, b).
        counts[(phonetic_strokes > semantic_strokes) -
               (phonetic_strokes < semantic_strokes) + 1] += 1
    # Guard the empty sample so we report zeros instead of dividing by zero.
    mean = lambda x: 1.0 * x / total if total else 0.0
    # List comprehension (not map) so Python 3 prints the values, not a
    # '<map object ...>' repr.
    print('(total, counts, phonetic_mean, semantic_mean):', (
        total, [mean(count) for count in counts],
        mean(phonetic_stroke_total), mean(semantic_stroke_total)))
if __name__ == '__main__':
    # Build a character -> entry map from the line-delimited JSON
    # dictionary, then run both analyses on it.
    characters = {}
    # NOTE(review): assumes dictionary.txt is UTF-8-compatible with the
    # platform default encoding — confirm on non-UTF-8 locales.
    with open('dictionary.txt') as f:
        # Iterating the file object replaces the removed f.xreadlines().
        for line in f:
            line = line.strip()
            # Strip BEFORE the emptiness test: a line holding only a
            # newline is truthy, and json.loads('') would raise.
            if not line:
                continue
            data = json.loads(line)
            characters[data['character']] = data
    run_left_right_analysis(characters)
    run_stroke_count_analysis(characters)