Files
makemeahanzi/scripts/analysis.py
2015-09-24 01:32:45 -04:00

119 lines
3.8 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
import cjklib.characterlookup
import cjklib.exception
import collections
# Shared cjklib lookup handle; every radical and decomposition query in this
# script goes through it.
# NOTE(review): 'C' is presumably cjklib's locale code selecting Chinese
# character forms -- confirm against the CharacterLookup documentation.
cjk = cjklib.characterlookup.CharacterLookup('C')
def MutableNamedTuple(name, fields):
    """Build a lightweight record class with named, mutable attributes.

    Args:
        name: Label used for the record in its string form.
        fields: Iterable of attribute names, in positional order.

    Returns:
        A new class whose constructor takes exactly one positional argument
        per entry in ``fields`` and stores each one as an instance attribute.
        ``str(instance)`` renders as ``name(value, value, ...)``.

    Raises:
        AssertionError: from the returned class's constructor, when the
            number of arguments differs from the number of fields.
    """
    # Snapshot the field names so that later mutation of the caller's list
    # cannot desynchronise the class from the attributes it stores.
    fields = tuple(fields)

    def tostr(value):
        # Unicode text is rendered as quoted UTF-8 instead of an escaped
        # repr. isinstance (rather than an exact type comparison) makes
        # unicode subclasses take the same path.
        if isinstance(value, unicode):
            return "'%s'" % (value.encode('utf8'))
        return repr(value)

    class TemporaryClass(object):
        __name__ = name

        def __init__(self, *args):
            # Raised explicitly instead of via ``assert`` so the check still
            # runs under ``python -O``; without it zip() would silently drop
            # the unmatched fields. The exception type is kept as
            # AssertionError so existing callers observe the same failure.
            if len(args) != len(fields):
                raise AssertionError(
                    '%s expects %d arguments, got %d'
                    % (name, len(fields), len(args)))
            for (key, value) in zip(fields, args):
                self.__dict__[key] = value

        def __str__(self):
            return '%s(%s)' % (name, ', '.join(
                tostr(self.__dict__[key]) for key in fields))

    return TemporaryClass
def in_cjk_block(character):
    """Report whether ``character`` is one code point in U+4E00..U+9FFF.

    Args:
        character: The text to test; expected to be a single character.

    Returns:
        True when ``character`` has length 1 and its code point lies in the
        inclusive range 0x4E00-0x9FFF. Otherwise prints a diagnostic line
        naming the offending code point(s) and returns False.
    """
    if len(character) == 1 and 0x4e00 <= ord(character) <= 0x9fff:
        return True
    # ord() only accepts a single character, so format each code unit
    # separately. This keeps the diagnostic working for empty or
    # multi-character input instead of raising TypeError.
    code_points = ' '.join('U+%X' % (ord(unit),) for unit in character)
    print('%s is %s' % (character, code_points or '(empty)'))
    return False
# Load the glyph list: the first line of scripts/glyphs, decoded from UTF-8
# into a single unicode string that is treated as one glyph per character.
with open('scripts/glyphs') as glyph_file:
    first_line = glyph_file.readlines()[0]
glyphs = first_line.strip().decode('utf8')

# Sanity checks: every glyph must pass in_cjk_block, and the list must hold
# exactly 6763 distinct characters.
assert all(in_cjk_block(character) for character in glyphs)
glyph_set = set(glyphs)
assert len(glyphs) == len(glyph_set) == 6763
# Map each representative character of a Kangxi radical (variants included)
# to its radical number, 1 through 214.
radical_map = {}
for radical_number in range(1, 215):
    for variant in cjk.getKangxiRadicalRepresentativeCharacters(radical_number):
        radical_map[variant] = radical_number
print('Got %s radicals, including variants.' % (len(radical_map),))

# Radical forms actually encountered; filled in while scanning decompositions.
radicals_used = set()
def decomposition_to_str(decomposition):
    """Render a decomposition as one UTF-8 encoded string.

    Args:
        decomposition: Iterable whose elements are either unicode strings or
            indexable ``(character, index)`` pairs.

    Returns:
        The concatenation of every element encoded as UTF-8. A pair whose
        index is non-zero is followed by ``[index]``.
    """
    pieces = []
    for element in decomposition:
        # isinstance (rather than an exact type comparison) keeps unicode
        # subclasses out of the pair branch, where they would be mis-indexed.
        if isinstance(element, unicode):
            pieces.append(element.encode('utf8'))
            continue
        pieces.append(element[0].encode('utf8'))
        if element[1] != 0:
            pieces.append('[%s]' % (element[1],))
    # Join once at the end instead of growing a string with repeated +=.
    return ''.join(pieces)
print('Checking decompositions:')
# Components that appear in some decomposition but are neither in the glyph
# list nor known radical forms.
extra_glyphs = set()
# Tallies for the two special cases below: 'indecomposable' and 'unknown'.
counts = collections.defaultdict(int)
decompositions = cjk.getDecompositionEntriesDict()
for glyph in glyphs:
    index = cjk.getDefaultGlyph(glyph)
    # Radicals, and glyphs with no decomposition entry, are treated as atoms.
    if glyph in radical_map or (glyph, index) not in decompositions:
        counts['indecomposable'] += 1
        if glyph in radical_map:
            radicals_used.add(glyph)
        continue
    # Only the first listed decomposition of each glyph is examined.
    decomposition = decompositions[(glyph, index)][0]
    for element in decomposition:
        # Bare unicode elements are skipped; only (part, index) pairs count.
        # NOTE(review): presumably these are layout operators -- confirm.
        if isinstance(element, unicode):
            continue
        # Named separately from the glyph's own `index` above so the outer
        # variable is not clobbered; only the disabled diagnostic reads it.
        (part, part_index) = element
        # If the index is non-zero here, the decomposition includes a variant
        # of the given part instead of the part itself. However, by
        # inspection, the few hundred or so times this occurs in the 6763
        # characters in GB2312 seem largely benign; most of the time, the
        # character is simply stretched horizontally or vertically.
        #if part_index != 0:
        #    print('In %s, got index %s for part %s' % (glyph, part_index, part))
        if part in radical_map:
            radicals_used.add(part)
            continue
        if part == u'':
            # An empty part is counted separately and never added as a glyph.
            counts['unknown'] += 1
            continue
        # glyph_set gives constant-time membership; testing against the
        # `glyphs` string would be a linear substring search per part.
        if part not in glyph_set and part not in extra_glyphs:
            if not in_cjk_block(part):
                print('(Found in %s.)' % (glyph,))
            extra_glyphs.add(part)
print(counts)
print('%s extra glyphs required for decompositions.' % (len(extra_glyphs),))
print('%s radicals required for decomposition.' % (len(radicals_used),))
def equivalent_form(radical):
    """Return the equivalent character for a radical form, when one exists.

    Args:
        radical: The character to look up.

    Returns:
        The value of ``cjk.getRadicalFormEquivalentCharacter(radical)``, or
        ``radical`` itself when that lookup raises.
    """
    try:
        return cjk.getRadicalFormEquivalentCharacter(radical)
    except Exception:
        # Deliberate best-effort fallback: a failed lookup leaves the input
        # unchanged. Catching Exception instead of using a bare ``except``
        # lets KeyboardInterrupt and SystemExit propagate.
        # NOTE(review): cjklib presumably signals "no equivalent" with
        # cjklib.exception.UnsupportedError -- narrow further once confirmed.
        return radical
def collapse(radicals):
    """Return the set of equivalent forms of every item in ``radicals``.

    Iterating a dict visits its keys, so a dict argument yields a set built
    from its keys.
    """
    collapsed = set()
    for radical in radicals:
        collapsed.add(equivalent_form(radical))
    return collapsed
# Normalise all three collections to equivalent character forms.
# Note that radical_map stops being a dict here: collapse() returns a set.
extra_glyphs = collapse(extra_glyphs)
radical_map = collapse(radical_map)
radicals_used = collapse(radicals_used)

# Radicals absent from the glyph list, sorted once and reused by each of the
# reports below; all of them also count as extra glyphs.
glyphless_radicals = sorted(
    radical for radical in radical_map if radical not in glyphs)
extra_glyphs.update(glyphless_radicals)

print('Final list of extra glyphs:')
print(''.join(sorted(extra_glyphs)))
print('\nUsed radicals without glyphs:')
print(''.join(
    radical for radical in glyphless_radicals if radical in radicals_used))
print('\nUnused radicals:')
print(''.join(
    radical for radical in glyphless_radicals
    if radical not in radicals_used))