mirror of
https://github.com/skishore/makemeahanzi.git
synced 2025-11-04 22:47:24 +08:00
147 lines
5.2 KiB
Python
Executable File
147 lines
5.2 KiB
Python
Executable File
#!/usr/bin/python
|
||
# -*- coding: utf-8 -*-
|
||
import cjklib.characterlookup
|
||
import cjklib.exception
|
||
import collections
|
||
|
||
cjk = cjklib.characterlookup.CharacterLookup('C')
|
||
|
||
def MutableNamedTuple(name, fields):
|
||
def tostr(value):
|
||
if type(value) == unicode:
|
||
return "'%s'" % (value.encode('utf8'))
|
||
return repr(value)
|
||
class TemporaryClass(object):
|
||
__name__ = name
|
||
def __init__(self, *args):
|
||
assert(len(args) == len(fields))
|
||
for (key, value) in zip(fields, args):
|
||
self.__dict__[key] = value
|
||
def __str__(self):
|
||
return '%s(%s)' % (name, ', '.join(
|
||
tostr(self.__dict__[key]) for key in fields))
|
||
return TemporaryClass
|
||
|
||
def in_cjk_block(character):
|
||
if not (len(character) == 1 and 0x4e00 <= ord(character) <= 0x9fff):
|
||
print '%s is U+%s' % (character, hex(ord(character))[2:].upper())
|
||
return False
|
||
return True
|
||
|
||
with open('scripts/glyphs') as f:
|
||
glyphs = f.readlines()[0].strip().decode('utf8')
|
||
assert(all(in_cjk_block(glyph) for glyph in glyphs))
|
||
glyph_set = set(glyphs)
|
||
assert(len(glyphs) == len(glyph_set) == 6763)
|
||
|
||
Radical = MutableNamedTuple('Radical', ['number', 'character', 'definition',
|
||
'pinyin', 'traditional', 'variants'])
|
||
|
||
with open('scripts/radicals') as f:
|
||
rows = [line[:-1].decode('utf8').split('\t') for line in f.readlines()[1:]]
|
||
radicals = [Radical(*row) for row in rows]
|
||
assert(len(radicals) == 216)
|
||
|
||
print 'Homogenizing derived radicals:'
|
||
for radical in radicals:
|
||
radical.number = int(radical.number)
|
||
radical.traditional = radical.traditional or None
|
||
radical.variants = \
|
||
tuple(sorted(radical.variants.split(','))) if radical.variants else ()
|
||
in_cjk_block(radical.character)
|
||
if radical.traditional is not None:
|
||
in_cjk_block(radical.traditional)
|
||
[in_cjk_block(variant) for variant in radical.variants]
|
||
assert(radical.definition)
|
||
assert(radical.pinyin)
|
||
|
||
radical_map = dict((radical.character, radical) for radical in radicals)
|
||
assert(len(radical_map) == len(radicals))
|
||
for radical in radicals:
|
||
if radical.traditional:
|
||
assert(radical.traditional not in radical_map)
|
||
radical_map[radical.traditional] = radical
|
||
for variant in radical.variants:
|
||
assert(variant not in radical_map), variant.encode('utf8')
|
||
radical_map[variant] = radical
|
||
print 'Got %s radicals, including variants.' % (len(radical_map),)
|
||
radicals_used = set()
|
||
|
||
def decomposition_to_str(decomposition):
|
||
result = ''
|
||
for element in decomposition:
|
||
if type(element) == unicode:
|
||
result += element.encode('utf8')
|
||
else:
|
||
result += element[0].encode('utf8')
|
||
if element[1] != 0:
|
||
result += '[%s]' % (element[1],)
|
||
return result
|
||
|
||
print 'Checking decompositions:'
|
||
extra_glyphs = set()
|
||
counts = collections.defaultdict(int)
|
||
decompositions = cjk.getDecompositionEntriesDict()
|
||
for glyph in glyphs:
|
||
index = cjk.getDefaultGlyph(glyph)
|
||
if glyph in radical_map or (glyph, index) not in decompositions:
|
||
counts['indecomposable'] += 1
|
||
if glyph in radical_map:
|
||
radicals_used.add(glyph)
|
||
continue
|
||
decomposition = decompositions[(glyph, index)][0]
|
||
for element in decomposition:
|
||
if type(element) == unicode:
|
||
continue
|
||
(part, index) = element
|
||
# If the index is non-zero here, the decomposition includes a variant of
|
||
# the given part instead of the part itself. However, by inspection, the
|
||
# few hundred or so times this occurs in the 6763 characters in GB2312 seem
|
||
# largely benign; most of the time, the character is simply stretched
|
||
# horizontally or vertically.
|
||
#if index != 0:
|
||
# print 'In %s, got index %s for part %s' % (glyph, index, part)
|
||
if part in radical_map:
|
||
radicals_used.add(part)
|
||
continue
|
||
if part == u'?':
|
||
counts['unknown'] += 1
|
||
continue
|
||
if part not in glyphs and part not in extra_glyphs:
|
||
assert part != u'㣺'
|
||
if cjk.isRadicalChar(part):
|
||
try:
|
||
equivalent_character = cjk.getRadicalFormEquivalentCharacter(part)
|
||
except cjklib.exception.UnsupportedError:
|
||
equivalent_character = None
|
||
if equivalent_character in radical_map:
|
||
if equivalent_character not in radicals_used:
|
||
radicals_used.add(equivalent_character)
|
||
print '(Found in %s. Equivalent character for %s: %s)' % (
|
||
glyph, part, equivalent_character)
|
||
continue
|
||
if not in_cjk_block(part):
|
||
print '(Found in %s.)' % (glyph,)
|
||
extra_glyphs.add(part)
|
||
|
||
print counts
|
||
print '%s extra glyphs required for decompositions.' % (len(extra_glyphs),)
|
||
print '%s radicals required for decomposition.' % (len(radicals_used),)
|
||
|
||
for radical in radical_map:
|
||
if radical not in glyphs:
|
||
extra_glyphs.add(radical)
|
||
print 'Final list of extra glyphs:'
|
||
print ''.join(sorted(extra_glyphs))
|
||
|
||
print '\nUsed radicals:'
|
||
print ''.join(sorted(radical for radical in radical_map
|
||
if radical not in glyphs and
|
||
(radical in radicals_used or
|
||
radical == radical_map[radical].character)))
|
||
|
||
print '\nUnused radicals:'
|
||
print ''.join(sorted(radical for radical in radical_map
|
||
if not (radical in radicals_used or
|
||
radical == radical_map[radical].character)))
|