"use strict";

const addFrequencyField = (glyph) => {
  const data = cjklib.getCharacterData(glyph.character);
  glyph.metadata.frequency = data.frequency;
  Glyphs.save(glyph);
}

const addSimplifiedAndTraditionalFields = (glyph) => {
  const data = cjklib.getCharacterData(glyph.character);
  glyph.simplified = data.simplified;
  glyph.traditional = data.traditional;
  Glyphs.save(glyph);
}

const checkStrokeExtractorStability = (glyph) => {
  const strokes = stroke_extractor.getStrokes(
      glyph.stages.path, glyph.stages.bridges);
  if (!_.isEqual(strokes.strokes.sort(), glyph.stages.strokes.sort())) {
    console.log(`Different strokes for ${glyph.character}`);
  }
}

const convertOldPathSchemaToSVGPath = (path) => {
  const terms = [];
  for (let segment of path) {
    assert('LMQZ'.indexOf(segment.type) >= 0, segment.type);
    terms.push(segment.type);
    if (segment.x1 !== undefined) {
      terms.push(segment.x1);
      terms.push(segment.y1);
    }
    if (segment.x !== undefined) {
      terms.push(segment.x);
      terms.push(segment.y);
    }
  }
  return terms.join(' ');
}
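
// For reference, a sketch of the conversion on a made-up old-schema path
// (the segment values below are hypothetical, not taken from the data):
//
//   convertOldPathSchemaToSVGPath([
//     {type: 'M', x: 128, y: 64},
//     {type: 'Q', x1: 256, y1: 128, x: 384, y: 64},
//     {type: 'Z'},
//   ]);
//   // => 'M 128 64 Q 256 128 384 64 Z'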

const dumpGlyph = (stream) => (glyph) => {
  if (!glyph.stages.verified) {
    return;
  }
  const analysis = glyph.stages.analysis;
  const order = glyph.stages.order;
  const data = cjklib.getCharacterData(glyph.character);
  const pinyin = (glyph.metadata.pinyin || data.pinyin || '')
      .split(',').map((x) => x.trim()).filter((x) => x);
  const strokes = order.map((x) => glyph.stages.strokes[x.stroke]);
  const medians = order.map((x) => x.median);
  strokes.map((x) => assert(x));
  medians.map((x) => assert(x));
  const has_etymology =
      analysis.etymology.hint || (analysis.etymology.type === 'pictophonetic');
  const result = {
    // Unicode character for this glyph. Required.
    character: glyph.character,
    // String definition targeted towards second-language learners. Optional.
    definition: glyph.metadata.definition || data.definition,
    // List of pronunciations of this character, split from a comma-separated
    // source. May be empty.
    pinyin: pinyin,
    // Ideograph Description Sequence decomposition of the character. See:
    // https://en.wikipedia.org/wiki/Chinese_character_description_languages#Ideographic_Description_Sequences
    //
    // Optional. Invalid if it starts with a full-width question mark '？'.
    // Note that even if the first character is a proper IDS symbol, any
    // component within the decomposition may be a wide question mark as well.
    // For example, if we have a decomposition of a character into a top and
    // bottom component but can only recognize the top component, we might
    // have a decomposition like so: '⿱逢？'
    decomposition: analysis.decomposition,
    // An etymology for the character. This field may be null. If present,
    // it will always have a "type" field, which will be one of "ideographic",
    // "pictographic", or "pictophonetic".
    //
    // If the type is one of the first two options, then the etymology will
    // always include a string "hint" field explaining its formation.
    //
    // If the type is "pictophonetic", then the etymology will contain three
    // other fields: "hint", "phonetic", and "semantic", each of which is
    // a string and each of which may be null. The etymology should be read as:
    //   ${semantic} (${hint}) provides the meaning while ${phonetic}
    //   provides the pronunciation.
    // with allowances for possible null values.
    etymology: has_etymology ? analysis.etymology : undefined,
    // Unicode primary radical for this character. Required.
    radical: analysis.radical,
    // List of SVG path data for each stroke of this character, ordered by
    // proper stroke order. Each stroke is laid out on a 1024x1024 size
    // coordinate system where:
    //   - The upper-left corner is at position (0, 900)
    //   - The lower-right corner is at position (1024, -124)
    // Note that the y-axis DECREASES as you move downwards, which is strange!
    // To display these paths properly, you should render them as follows:
    //   <svg viewBox="0 0 1024 1024">
    //     <g transform="scale(1, -1) translate(0, -900)">
    //       <path d="STROKE[0] DATA GOES HERE"></path>
    //       <path d="STROKE[1] DATA GOES HERE"></path>
    //       ...
    //     </g>
    //   </svg>
    strokes: strokes,
    // A list of stroke medians, in the same coordinate system as the SVG
    // paths above. These medians can be used to produce a rough stroke-order
    // animation, although it is a bit tricky.
    //
    // Each median is a list of pairs of integers.
    // This list will be as long as the strokes list.
    medians: medians,
    // A list of stroke medians, normalized to be in a sane coordinate system
    // so that they can be used for handwriting recognition:
    //   - The upper-left corner is at position (0, 0)
    //   - The lower-right corner is at position (1, 1)
    //
    // Each normalized median is a list of pairs of floating-point numbers.
    // This list will be as long as the strokes list.
    normalized_medians: medians.map(median_util.normalizeForMatch),
    // A list of mappings from strokes of this character to strokes of its
    // components, as indexed in its decomposition tree. Any given entry in
    // this list may be null. If an entry is not null, it will be a list of
    // indices corresponding to a path down the decomposition tree.
    //
    // This schema is a little tricky to explain without an example. Suppose
    // that the character '俢' has the decomposition: '⿰亻⿱夂彡'
    //
    // The third stroke in that character belongs to the component '夂'.
    // Its match would be [1, 0]. That is, if you think of the decomposition as
    // a tree, it has '⿰' at its root with two children '亻' and '⿱', and
    // '⿱' further has two children '夂' and '彡'. The path down the tree
    // to '夂' is to take the second child of '⿰' and the first child of
    // '⿱'; hence, [1, 0].
    //
    // This field can be used to generate visualizations marking each component
    // within a given character, or potentially for more exotic purposes.
    matches: order.map((x) => x.match),
  }
  stream.write(JSON.stringify(result));
  stream.write('\n');
}
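
// For reference, the corner-to-corner map implied by the comments above
// (from the raw 1024x1024 stroke coordinates to the normalized [0, 1] system)
// works out to:
//
//   x_norm = x / 1024
//   y_norm = (900 - y) / 1024
//
// This is only a sketch of the documented coordinate systems; the actual
// median_util.normalizeForMatch implementation may differ in details (for
// example, it may also resample the points of each median).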

const migrateOldGlyphSchemaToNew = (glyph) => {
  const codepoint = parseInt(glyph.name.substr(3), 16);
  const character = String.fromCodePoint(codepoint);
  const data = cjklib.getCharacterData(character);
  assert(glyph.manual && glyph.manual.verified !== undefined,
         `Glyph ${character} was not verified.`);
  // Pull definition and pinyin from the simplified character, if available.
  let definition = undefined;
  let pinyin = undefined;
  if (data.simplified) {
    const simplified = Glyphs.get(data.simplified);
    const metadata = (simplified || {metadata: {}}).metadata;
    const base = cjklib.getCharacterData(data.simplified);
    definition = metadata.definition || base.definition;
    pinyin = metadata.pinyin || base.pinyin;
  }
  const result = {
    character: character,
    codepoint: codepoint,
    metadata: {
      definition: definition,
      frequency: data.frequency,
      kangxi_index: data.kangxi_index,
      pinyin: pinyin,
      strokes: undefined,
    },
    stages: {
      path: convertOldPathSchemaToSVGPath(glyph.path),
      bridges: glyph.manual.bridges,
      strokes: glyph.derived.strokes,
      analysis: undefined,
      order: undefined,
      verified: undefined,
    },
    simplified: data.simplified,
    traditional: data.traditional,
  };
  assert(result.stages.path !== undefined);
  assert(result.stages.bridges !== undefined);
  assert(result.stages.strokes !== undefined);
  return result;
}
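
// For reference, a sketch of the name-to-character step above, assuming that
// old-schema glyph names look like 'uni4E2D' (the exact prefix format is an
// assumption, not taken from the data):
//
//   parseInt('uni4E2D'.substr(3), 16);  // => 20013 (0x4e2d)
//   String.fromCodePoint(20013);        // => '中'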

// Meteor methods that make use of the migration system follow.

const dumpToNewSchemaJSON = () => {
  const fs = Npm.require('fs');
  const path = Npm.require('path');
  const filepath = path.join(getPWD(), 'server', 'makemeahanzi.txt');
  const stream = fs.createWriteStream(filepath);
  runMigration(dumpGlyph(stream), (() => stream.end()));
}

const loadFromOldSchemaJSON = (filename) => {
  const fs = Npm.require('fs');
  const path = Npm.require('path');
  const filepath = path.join(getPWD(), 'public', filename);
  fs.readFile(filepath, 'utf8', Meteor.bindEnvironment((error, data) => {
    if (error) throw error;
    const lines = data.split('\n').filter((x) => x.length > 0);
    console.log(`Loaded ${lines.length} old-schema glyphs.`);
    let migrated = 0;
    let definition = 0;
    let pinyin = 0;
    for (var line of lines) {
      try {
        const old_glyph = JSON.parse(line);
        const new_glyph = migrateOldGlyphSchemaToNew(old_glyph);
        const glyph = Glyphs.get(new_glyph.character);
        if (glyph && glyph.stages.verified) {
          console.log(`Glyph already verified: ${glyph.character}`);
          continue;
        }
        Glyphs.save(new_glyph);
        migrated += 1;
        definition += new_glyph.metadata.definition ? 1 : 0;
        pinyin += new_glyph.metadata.pinyin ? 1 : 0;
      } catch (error) {
        console.error(error);
      }
    }
    console.log(`Successfully migrated ${migrated} glyphs.`);
    console.log(`Pulled definitions for ${definition} glyphs.`);
    console.log(`Pulled pinyin for ${pinyin} glyphs.`);
  }));
}

// Runs the given per-glyph callback for each glyph in the database.
// When all the glyphs are migrated, runs the completion callback.
const runMigration = (per_glyph_callback, completion_callback) => {
  console.log('Running migration...');
  if (per_glyph_callback) {
    const codepoints =
        Glyphs.find({}, {fields: {codepoint: 1}, sort: {codepoint: 1}}).fetch();
    for (let i = 0; i < codepoints.length; i++) {
      const glyph = Glyphs.findOne({codepoint: codepoints[i].codepoint});
      assert(glyph, 'Glyphs changed during migration!');
      per_glyph_callback(glyph);
      if ((i + 1) % 1000 === 0) {
        console.log(`Migrated ${i + 1} glyphs.`);
      }
    }
  }
  if (completion_callback) {
    completion_callback();
  }
  console.log('Migration complete.');
}
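
// Example usage (a sketch, not wired up by default): to backfill frequency
// data for every glyph, a migration like the following could be run from
// startup code or a Meteor method, mirroring the patterns used below:
//
//   cjklib.promise
//       .then(Meteor.bindEnvironment(() => runMigration(addFrequencyField)))
//       .catch(console.error.bind(console));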

Meteor.methods({
  'export': () => {
    cjklib.promise.then(Meteor.bindEnvironment(dumpToNewSchemaJSON))
        .catch(console.error.bind(console));
  },
  'loadFromOldSchemaJSON': (filename) => {
    cjklib.promise.then(
        Meteor.bindEnvironment(() => loadFromOldSchemaJSON(filename)))
        .catch(console.error.bind(console));
  },
});
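
// These methods can be invoked from a client console or the Meteor shell, for
// example (the filename below is hypothetical; it must name a file under the
// app's public/ directory):
//
//   Meteor.call('export');
//   Meteor.call('loadFromOldSchemaJSON', 'old_glyphs.txt');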

Meteor.startup(() => {
  // To run a migration at startup, set these callbacks (for example, to one
  // of the per-glyph helpers defined above, such as addFrequencyField).
  const completion_callback = undefined;
  const per_glyph_callback = undefined;
  if (!per_glyph_callback && !completion_callback) {
    return;
  }
  console.log('Preparing for migration...');
  const migration = () => runMigration(per_glyph_callback, completion_callback);
  cjklib.promise.then(Meteor.bindEnvironment(migration))
      .catch(console.error.bind(console));
});