Files
makemeahanzi/stats.py
2016-02-18 12:00:20 -05:00

60 lines
1.9 KiB
Python

# -*- coding: utf-8 -*-
import json
def run_left_right_analysis(characters):
total = 0
right = 0
left = 0
for data in characters.itervalues():
if 'decomposition' not in data or 'etymology' not in data:
continue
(decomposition, etymology) = (data['decomposition'], data['etymology'])
if etymology['type'] != 'pictophonetic':
continue
if decomposition[0] != u'' or len(decomposition) != 3:
continue
total += 1
phonetic = etymology.get('phonetic')
if phonetic == decomposition[1]:
left += 1
if phonetic == decomposition[2]:
right += 1
print '(total, left, right):', (total, left, right)
def run_stroke_count_analysis(characters):
total = 0
counts = [0, 0, 0]
phonetic_stroke_total = 0
semantic_stroke_total = 0
for data in characters.itervalues():
etymology = data.get('etymology', {})
if 'phonetic' not in etymology or 'semantic' not in etymology:
continue
(phonetic, semantic) = (etymology['phonetic'], etymology['semantic'])
if phonetic not in characters or semantic not in characters:
continue
total += 1
phonetic_strokes = len(characters[phonetic]['matches'])
semantic_strokes = len(characters[semantic]['matches'])
phonetic_stroke_total += phonetic_strokes
semantic_stroke_total += semantic_strokes
counts[cmp(phonetic_strokes, semantic_strokes) + 1] += 1
mean = lambda x: 1.0 * x / total
print '(total, counts, phonetic_mean, semantic_mean):', (
total, map(mean, counts),
mean(phonetic_stroke_total), mean(semantic_stroke_total))
if __name__ == '__main__':
characters = {}
with open('dictionary.txt') as f:
for line in f.xreadlines():
if not line:
continue
data = json.loads(line.strip())
characters[data['character']] = data
run_left_right_analysis(characters)
run_stroke_count_analysis(characters)