Files
makemeahanzi/scripts/analysis.py
2015-09-23 02:10:58 -04:00

147 lines
5.2 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
import cjklib.characterlookup
import cjklib.exception
import collections
cjk = cjklib.characterlookup.CharacterLookup('C')
def MutableNamedTuple(name, fields):
def tostr(value):
if type(value) == unicode:
return "'%s'" % (value.encode('utf8'))
return repr(value)
class TemporaryClass(object):
__name__ = name
def __init__(self, *args):
assert(len(args) == len(fields))
for (key, value) in zip(fields, args):
self.__dict__[key] = value
def __str__(self):
return '%s(%s)' % (name, ', '.join(
tostr(self.__dict__[key]) for key in fields))
return TemporaryClass
def in_cjk_block(character):
if not (len(character) == 1 and 0x4e00 <= ord(character) <= 0x9fff):
print '%s is U+%s' % (character, hex(ord(character))[2:].upper())
return False
return True
with open('scripts/glyphs') as f:
glyphs = f.readlines()[0].strip().decode('utf8')
assert(all(in_cjk_block(glyph) for glyph in glyphs))
glyph_set = set(glyphs)
assert(len(glyphs) == len(glyph_set) == 6763)
Radical = MutableNamedTuple('Radical', ['number', 'character', 'definition',
'pinyin', 'traditional', 'variants'])
with open('scripts/radicals') as f:
rows = [line[:-1].decode('utf8').split('\t') for line in f.readlines()[1:]]
radicals = [Radical(*row) for row in rows]
assert(len(radicals) == 216)
print 'Homogenizing derived radicals:'
for radical in radicals:
radical.number = int(radical.number)
radical.traditional = radical.traditional or None
radical.variants = \
tuple(sorted(radical.variants.split(','))) if radical.variants else ()
in_cjk_block(radical.character)
if radical.traditional is not None:
in_cjk_block(radical.traditional)
[in_cjk_block(variant) for variant in radical.variants]
assert(radical.definition)
assert(radical.pinyin)
radical_map = dict((radical.character, radical) for radical in radicals)
assert(len(radical_map) == len(radicals))
for radical in radicals:
if radical.traditional:
assert(radical.traditional not in radical_map)
radical_map[radical.traditional] = radical
for variant in radical.variants:
assert(variant not in radical_map), variant.encode('utf8')
radical_map[variant] = radical
print 'Got %s radicals, including variants.' % (len(radical_map),)
radicals_used = set()
def decomposition_to_str(decomposition):
result = ''
for element in decomposition:
if type(element) == unicode:
result += element.encode('utf8')
else:
result += element[0].encode('utf8')
if element[1] != 0:
result += '[%s]' % (element[1],)
return result
print 'Checking decompositions:'
extra_glyphs = set()
counts = collections.defaultdict(int)
decompositions = cjk.getDecompositionEntriesDict()
for glyph in glyphs:
index = cjk.getDefaultGlyph(glyph)
if glyph in radical_map or (glyph, index) not in decompositions:
counts['indecomposable'] += 1
if glyph in radical_map:
radicals_used.add(glyph)
continue
decomposition = decompositions[(glyph, index)][0]
for element in decomposition:
if type(element) == unicode:
continue
(part, index) = element
# If the index is non-zero here, the decomposition includes a variant of
# the given part instead of the part itself. However, by inspection, the
# few hundred or so times this occurs in the 6763 characters in GB2312 seem
# largely benign; most of the time, the character is simply stretched
# horizontally or vertically.
#if index != 0:
# print 'In %s, got index %s for part %s' % (glyph, index, part)
if part in radical_map:
radicals_used.add(part)
continue
if part == u'':
counts['unknown'] += 1
continue
if part not in glyphs and part not in extra_glyphs:
assert part != u''
if cjk.isRadicalChar(part):
try:
equivalent_character = cjk.getRadicalFormEquivalentCharacter(part)
except cjklib.exception.UnsupportedError:
equivalent_character = None
if equivalent_character in radical_map:
if equivalent_character not in radicals_used:
radicals_used.add(equivalent_character)
print '(Found in %s. Equivalent character for %s: %s)' % (
glyph, part, equivalent_character)
continue
if not in_cjk_block(part):
print '(Found in %s.)' % (glyph,)
extra_glyphs.add(part)
print counts
print '%s extra glyphs required for decompositions.' % (len(extra_glyphs),)
print '%s radicals required for decomposition.' % (len(radicals_used),)
for radical in radical_map:
if radical not in glyphs:
extra_glyphs.add(radical)
print 'Final list of extra glyphs:'
print ''.join(sorted(extra_glyphs))
print '\nUsed radicals:'
print ''.join(sorted(radical for radical in radical_map
if radical not in glyphs and
(radical in radicals_used or
radical == radical_map[radical].character)))
print '\nUnused radicals:'
print ''.join(sorted(radical for radical in radical_map
if not (radical in radicals_used or
radical == radical_map[radical].character)))