mirror of
https://github.com/skishore/makemeahanzi.git
synced 2025-11-07 00:09:50 +08:00
97 lines
3.9 KiB
Python
Executable File
97 lines
3.9 KiB
Python
Executable File
#!/usr/bin/python
|
|
def MutableNamedTuple(name, fields):
    """Build a small record class whose attributes can be reassigned.

    Args:
        name: Name given to the generated class and used as the prefix of
            str(instance).
        fields: Ordered list of attribute names. The constructor takes
            exactly one positional argument per field, in this order.

    Returns:
        A new class. Instances keep the fields as ordinary attributes, so
        they can be overwritten and extra attributes can be added later.
        str(instance) shows only the declared fields, formatted as
        "name(value1, value2, ...)".

    Raises:
        TypeError: from the generated constructor, when the number of
            positional arguments differs from len(fields).
    """
    try:
        text_type = unicode
    except NameError:
        # No separate `unicode` type on this interpreter; str is the text type.
        text_type = str

    def tostr(value):
        # Text is shown as its own characters between quotes instead of going
        # through repr(), which would escape non-ASCII characters on Python 2.
        if isinstance(value, text_type):
            if str is bytes:
                # Byte-string formatting (Python 2) needs encoded bytes.
                value = value.encode('utf8')
            return "'%s'" % (value,)
        return repr(value)

    class TemporaryClass(object):
        # Kept so that instance.__name__ keeps resolving to `name`.
        __name__ = name

        def __init__(self, *args):
            # An explicit check (not an assert) so that a short row cannot be
            # silently truncated by zip() when asserts are disabled with -O.
            if len(args) != len(fields):
                raise TypeError('%s() takes exactly %d arguments (%d given)' % (
                    name, len(fields), len(args)))
            for (key, value) in zip(fields, args):
                self.__dict__[key] = value

        def __str__(self):
            return '%s(%s)' % (name, ', '.join(
                tostr(self.__dict__[key]) for key in fields))

    # The class-body `__name__` entry above does not rename the class itself;
    # assigning through the class object does.
    TemporaryClass.__name__ = str(name)
    return TemporaryClass
|
|
|
|
# Glyph inventory: the first line of scripts/glyphs, stripped and decoded from
# UTF-8. `glyphs` is the resulting text and `glyph_set` the set of its
# characters.
with open('scripts/glyphs') as f:
    glyphs = f.readlines()[0]
glyphs = glyphs.strip().decode('utf8')
glyph_set = set(glyphs)
# Sanity check: exactly 6763 characters and no character listed twice.
assert len(glyph_set) == 6763 and len(glyphs) == len(glyph_set)
|
|
|
|
# Record type for one row of scripts/radicals; the field order matches the
# column order produced by the split below.
Radical = MutableNamedTuple(
    'Radical', ['number', 'character', 'definition', 'pinyin', 'strokes'])

with open('scripts/radicals') as f:
    # NOTE(review): the delimiter as it appears here is a single space, yet
    # later code in this file searches for a space inside the `strokes`
    # column, which such a split could never leave behind. Confirm the
    # delimiter against the data file before changing it.
    rows = [line.strip().decode('utf8').split(' ') for line in f]

# One record per row, plus a lookup from radical character to its record.
radicals = [Radical(*row) for row in rows]
radical_map = dict((radical.character, radical) for radical in radicals)
# Sanity check: 214 rows and 214 distinct characters.
assert len(radicals) == 214 and len(radical_map) == 214
|
|
|
|
# Record type for one row of scripts/wiki_radicals (eleven tab-separated
# columns; three of them are named as unused).
WikiRadical = MutableNamedTuple(
    'WikiRadical', ['number', 'character', 'strokes', 'pinyin',
                    'unused1', 'unused2', 'unused3', 'definition',
                    'frequency', 'simplified', 'examples'])

with open('scripts/wiki_radicals') as f:
    rows = []
    # The first two lines of the file are skipped.
    for line in f.readlines()[2:]:
        rows.append(line.strip().decode('utf8').split('\t'))

# One record per remaining row, plus a lookup from character to record.
wiki_radicals = [WikiRadical(*row) for row in rows]
wiki_radical_map = dict(
    (radical.character, radical) for radical in wiki_radicals)
# Sanity check: 214 rows and 214 distinct characters.
assert len(wiki_radicals) == 214 and len(wiki_radical_map) == 214
|
|
|
|
# Normalise every radical record in place:
#   * `number` and `strokes` become ints;
#   * text after the first space in the raw `strokes` column is either a
#     parenthesised value, stored without the parentheses as `traditional`,
#     or a whitespace-separated list stored as `variants`;
#   * `traditional` is None when absent and `variants` is always a list.
for radical in radicals:
    radical.number = int(radical.number)

    # partition() leaves `extra` empty when the column contains no space.
    strokes, _sep, extra = radical.strokes.partition(' ')
    radical.strokes = int(strokes)

    if extra.startswith('('):
        assert extra.endswith(')')
        radical.traditional = extra[1:-1]
        # The parenthesised text is not a variant list.
        extra = ''
    else:
        radical.traditional = None
    # ''.split() is [], so records without variants get an empty list.
    radical.variants = extra.split()

    # Sanity checks on the normalised record.
    assert len(radical.character) == 1
    assert radical.traditional is None or len(radical.traditional) == 1
    assert all(len(variant) == 1 for variant in radical.variants)
    assert radical.definition
    assert radical.pinyin
|
|
|
|
# Walk both radical lists in lockstep, print each pair, and report any
# character or definition that differs between the two sources.
# Each print takes a single parenthesised expression, which the Python 2
# print statement treats exactly like the unparenthesised form.
for (radical, wiki_radical) in zip(radicals, wiki_radicals):
    print(radical)
    print(wiki_radical)
    # `radical.number` was converted to int earlier in this file, while
    # `wiki_radical.number` is still the text read from the file; comparing
    # them directly could never succeed, so convert before comparing.
    assert radical.number == int(wiki_radical.number)
    if radical.character != wiki_radical.character:
        print('Different characters for radical %s: %s vs. %s' % (
            radical.number, radical.character, wiki_radical.character))
    if radical.definition != wiki_radical.definition:
        print('Different definitions for radical %s: "%s" vs. "%s"' % (
            radical.number, radical.definition, wiki_radical.definition))
|
|
|
|
# Record type for one eleven-column row of data/decomposition/data.
Decomposition = MutableNamedTuple(
    'Decomposition', ['character', 'strokes', 'type', 'part1', 'strokes1',
                      'warning1', 'part2', 'strokes2', 'warning2',
                      'cangjie', 'radical'])

with open('data/decomposition/data') as f:
    # Only lines that begin with a tab are data rows.
    lines = [line for line in f if line.startswith('\t')]

# strip() removes the leading tab (and any trailing whitespace) before the
# split into columns.
rows = [line.strip().decode('utf8').split('\t') for line in lines]

# Rows that do not have exactly eleven columns are skipped.
decompositions = []
for row in rows:
    if len(row) != 11:
        continue
    decompositions.append(Decomposition(*row))

decomposition_map = dict(
    (decomposition.character, decomposition)
    for decomposition in decompositions)
# Sanity check on the number of distinct characters loaded.
assert len(decomposition_map) == 21166
|
|
|
|
# Every glyph in the inventory must have a decomposition entry.
for glyph in glyphs:
    assert glyph in decomposition_map, 'Missing glyph: %s' % (glyph,)
    decomposition = decomposition_map[glyph]
    # Visit each character of both parts. Parts that are '*' or already in the
    # glyph set need nothing further; for the rest the report is currently
    # disabled, so this loop has no visible effect.
    for part in decomposition.part1 + decomposition.part2:
        if part == '*' or part in glyph_set:
            continue
        #print 'Extra glyph needed for %s: %s' % (glyph, part)
|