makemeahanzi/server/migration.js
"use strict";
const addFrequencyField = (glyph) => {
const data = cjklib.getCharacterData(glyph.character);
glyph.metadata.frequency = data.frequency;
Glyphs.save(glyph);
}
const addSimplifiedAndTraditionalFields = (glyph) => {
const data = cjklib.getCharacterData(glyph.character);
glyph.simplified = data.simplified;
glyph.traditional = data.traditional;
Glyphs.save(glyph);
}
const checkStrokeExtractorStability = (glyph) => {
const strokes = stroke_extractor.getStrokes(
glyph.stages.path, glyph.stages.bridges);
if (!_.isEqual(strokes.strokes.sort(), glyph.stages.strokes.sort())) {
console.log(`Different strokes for ${glyph.character}`);
}
}
const convertOldPathSchemaToSVGPath = (path) => {
  const terms = [];
  for (let segment of path) {
    assert('LMQZ'.indexOf(segment.type) >= 0, segment.type);
    terms.push(segment.type);
    if (segment.x1 !== undefined) {
      terms.push(segment.x1);
      terms.push(segment.y1);
    }
    if (segment.x !== undefined) {
      terms.push(segment.x);
      terms.push(segment.y);
    }
  }
  return terms.join(' ');
}
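
// Illustrative sketch (an addition, not part of the original migration and
// not called anywhere): how a hypothetical old-schema path converts to SVG
// path data. The sample segments below are invented for demonstration; only
// the segment types L, M, Q, and Z are accepted.
const demoConvertOldPathSchema = () => {
  const old_path = [
    {type: 'M', x: 100, y: 200},
    {type: 'Q', x1: 150, y1: 250, x: 200, y: 300},
    {type: 'Z'},
  ];
  // Returns 'M 100 200 Q 150 250 200 300 Z'.
  return convertOldPathSchemaToSVGPath(old_path);
}
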
const dumpGlyph = (stream) => (glyph) => {
  if (!glyph.stages.verified) {
    return;
  }
  const analysis = glyph.stages.analysis;
  const order = glyph.stages.order;
  const data = cjklib.getCharacterData(glyph.character);
  const pinyin = (glyph.metadata.pinyin || data.pinyin || '')
      .split(',').map((x) => x.trim()).filter((x) => x);
  const strokes = order.map((x) => glyph.stages.strokes[x.stroke]);
  const medians = order.map((x) => x.median);
  strokes.map((x) => assert(x));
  medians.map((x) => assert(x));
  const has_etymology =
      analysis.etymology.hint || (analysis.etymology.type === 'pictophonetic');
  const result = {
    // Unicode character for this glyph. Required.
    character: glyph.character,
    // String definition targeted towards second-language learners. Optional.
    definition: glyph.metadata.definition || data.definition,
    // Comma-separated list of pronunciations of this character. May be empty.
    pinyin: pinyin,
    // Ideographic Description Sequence decomposition of the character. See:
    // https://en.wikipedia.org/wiki/Chinese_character_description_languages#Ideographic_Description_Sequences
    //
    // Optional. Invalid if it starts with a full-width question mark '？'.
    // Note that even if the first character is a proper IDS symbol, any
    // component within the decomposition may be a wide question mark as well.
    // For example, if we have a decomposition of a character into a top and
    // bottom component but can only recognize the top component, we might
    // have a decomposition like so: '⿱逢？'
    decomposition: analysis.decomposition,
    // An etymology for the character. This field may be null. If present,
    // it will always have a "type" field, which will be one of "ideographic",
    // "pictographic", or "pictophonetic".
    //
    // If the type is one of the first two options, then the etymology will
    // always include a string "hint" field explaining its formation.
    //
    // If the type is "pictophonetic", then the etymology will contain three
    // other fields: "hint", "phonetic", and "semantic", each of which is
    // a string and each of which may be null. The etymology should be read as:
    //   ${semantic} (${hint}) provides the meaning while ${phonetic}
    //   provides the pronunciation.
    // with allowances for possible null values.
    etymology: has_etymology ? analysis.etymology : undefined,
    // Unicode primary radical for this character. Required.
    radical: analysis.radical,
    // List of SVG path data for each stroke of this character, ordered by
    // proper stroke order. Each stroke is laid out on a 1024x1024 size
    // coordinate system where:
    //   - The upper-left corner is at position (0, 900)
    //   - The lower-right corner is at position (1024, -124)
    // Note that the y-axis DECREASES as you move downwards, which is strange!
    // To display these paths properly, you should render them as follows:
    //   <svg viewBox="0 0 1024 1024">
    //     <g transform="scale(1, -1) translate(0, -900)">
    //       <path d="STROKE[0] DATA GOES HERE"></path>
    //       <path d="STROKE[1] DATA GOES HERE"></path>
    //       ...
    //     </g>
    //   </svg>
    strokes: strokes,
    // A list of stroke medians, in the same coordinate system as the SVG
    // paths above. These medians can be used to produce a rough stroke-order
    // animation, although it is a bit tricky.
    //
    // Each median is a list of pairs of integers.
    // This list will be as long as the strokes list.
    medians: medians,
    // A list of stroke medians, normalized to be in a sane coordinate system
    // so that they can be used for handwriting recognition:
    //   - The upper-left corner is at position (0, 0)
    //   - The lower-right corner is at position (1, 1)
    //
    // Each normalized median is a list of pairs of floating-point numbers.
    // This list will be as long as the strokes list.
    normalized_medians: medians.map(median_util.normalizeForMatch),
    // A list of mappings from strokes of this character to strokes of its
    // components, as indexed in its decomposition tree. Any given entry in
    // this list may be null. If an entry is not null, it will be a list of
    // indices corresponding to a path down the decomposition tree.
    //
    // This schema is a little tricky to explain without an example. Suppose
    // that the character '俢' has the decomposition '⿰亻⿱夂彡'.
    //
    // The third stroke in that character belongs to the component '夂'.
    // Its match would be [1, 0]. That is, if you think of the decomposition as
    // a tree, it has '⿰' at its root with two children '亻' and '⿱', and
    // '⿱' further has two children '夂' and '彡'. The path down the tree
    // to '夂' is to take the second child of '⿰' and the first of '⿱',
    // hence, [1, 0].
    //
    // This field can be used to generate visualizations marking each component
    // within a given character, or potentially for more exotic purposes.
    // (An illustrative tree-walking sketch follows this function.)
    matches: order.map((x) => x.match),
  }
  stream.write(JSON.stringify(result));
  stream.write('\n');
}
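
// Illustrative sketch (an addition, not part of the original migration):
// walking a decomposition tree along a "match" path to recover the component
// that a given stroke belongs to, as described in the comment on "matches"
// above. Assumes that '⿲' and '⿳' take three children and that every other
// IDS operator takes two.
const followMatchPath = (decomposition, match) => {
  const symbols = Array.from(decomposition);
  let index = 0;
  const parse = () => {
    const symbol = symbols[index];
    index += 1;
    const arity = '⿲⿳'.indexOf(symbol) >= 0 ?
        3 : ('⿰⿱⿴⿵⿶⿷⿸⿹⿺⿻'.indexOf(symbol) >= 0 ? 2 : 0);
    const children = [];
    for (let i = 0; i < arity; i++) {
      children.push(parse());
    }
    return {symbol: symbol, children: children};
  };
  let node = parse();
  for (let step of match) {
    node = node.children[step];
  }
  return node.symbol;
}
// For example, followMatchPath('⿰亻⿱夂彡', [1, 0]) returns '夂'.
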
const migrateOldGlyphSchemaToNew = (glyph) => {
  const codepoint = parseInt(glyph.name.substr(3), 16);
  const character = String.fromCodePoint(codepoint);
  const data = cjklib.getCharacterData(character);
  assert(glyph.manual && glyph.manual.verified !== undefined,
         `Glyph ${character} was not verified.`);
  // Pull definition and pinyin from simplified character, if available.
  let definition = undefined;
  let pinyin = undefined;
  if (data.simplified) {
    const simplified = Glyphs.get(data.simplified);
    const metadata = (simplified || {metadata: {}}).metadata;
    const base = cjklib.getCharacterData(data.simplified);
    definition = metadata.definition || base.definition;
    pinyin = metadata.pinyin || base.pinyin;
  }
  const result = {
    character: character,
    codepoint: codepoint,
    metadata: {
      definition: definition,
      frequency: data.frequency,
      kangxi_index: data.kangxi_index,
      pinyin: pinyin,
      strokes: undefined,
    },
    stages: {
      path: convertOldPathSchemaToSVGPath(glyph.path),
      bridges: glyph.manual.bridges,
      strokes: glyph.derived.strokes,
      analysis: undefined,
      order: undefined,
      verified: undefined,
    },
    simplified: data.simplified,
    traditional: data.traditional,
  };
  assert(result.stages.path !== undefined);
  assert(result.stages.bridges !== undefined);
  assert(result.stages.strokes !== undefined);
  return result;
}
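
// Illustrative sketch (an addition, not called anywhere): the old-schema
// glyph name encodes the character's codepoint after a three-character
// prefix, as the substr(3) call above assumes, in the style of the common
// 'uniXXXX' font-glyph naming convention. The 'uni4E2D' name here is a
// hypothetical example.
const demoParseOldGlyphName = () => {
  const codepoint = parseInt('uni4E2D'.substr(3), 16);  // 0x4e2d
  return String.fromCodePoint(codepoint);               // '中'
}
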
// Meteor methods that make use of the migration system follow.
const dumpToNewSchemaJSON = () => {
  const fs = Npm.require('fs');
  const path = Npm.require('path');
  const filepath = path.join(getPWD(), 'server', 'makemeahanzi.txt');
  const stream = fs.createWriteStream(filepath);
  runMigration(dumpGlyph(stream), () => stream.end());
}
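
// Illustrative sketch (an addition): consuming the dump produced above. Each
// line of makemeahanzi.txt is one JSON object in the new schema, so a reader
// can parse the file line by line.
const demoReadDump = (contents) => {
  return contents.split('\n').filter((x) => x.length > 0)
                 .map((line) => JSON.parse(line));
}
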
const loadFromOldSchemaJSON = (filename) => {
  const fs = Npm.require('fs');
  const path = Npm.require('path');
  const filepath = path.join(getPWD(), 'public', filename);
  fs.readFile(filepath, 'utf8', Meteor.bindEnvironment((error, data) => {
    if (error) throw error;
    const lines = data.split('\n').filter((x) => x.length > 0);
    console.log(`Loaded ${lines.length} old-schema glyphs.`);
    let migrated = 0;
    let definition = 0;
    let pinyin = 0;
    for (let line of lines) {
      try {
        const old_glyph = JSON.parse(line);
        const new_glyph = migrateOldGlyphSchemaToNew(old_glyph);
        const glyph = Glyphs.get(new_glyph.character);
        if (glyph && glyph.stages.verified) {
          console.log(`Glyph already verified: ${glyph.character}`);
          continue;
        }
        Glyphs.save(new_glyph);
        migrated += 1;
        definition += new_glyph.metadata.definition ? 1 : 0;
        pinyin += new_glyph.metadata.pinyin ? 1 : 0;
      } catch (error) {
        console.error(error);
      }
    }
    console.log(`Successfully migrated ${migrated} glyphs.`);
    console.log(`Pulled definitions for ${definition} glyphs.`);
    console.log(`Pulled pinyin for ${pinyin} glyphs.`);
  }));
}
// Runs the given per-glyph callback for each glyph in the database.
// When all the glyphs are migrated, runs the completion callback.
const runMigration = (per_glyph_callback, completion_callback) => {
  console.log('Running migration...');
  if (per_glyph_callback) {
    const codepoints =
        Glyphs.find({}, {fields: {codepoint: 1}, sort: {codepoint: 1}}).fetch();
    for (let i = 0; i < codepoints.length; i++) {
      const glyph = Glyphs.findOne({codepoint: codepoints[i].codepoint});
      assert(glyph, 'Glyphs changed during migration!');
      per_glyph_callback(glyph);
      if ((i + 1) % 1000 === 0) {
        console.log(`Migrated ${i + 1} glyphs.`);
      }
    }
  }
  if (completion_callback) {
    completion_callback();
  }
  console.log('Migration complete.');
}
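
// Illustrative sketch (an addition, not called anywhere): wiring one of the
// per-glyph migrations above into runMigration. In practice the callbacks
// are supplied through the Meteor.startup block at the bottom of this file.
const demoRunFrequencyMigration = () => {
  runMigration(addFrequencyField, () => console.log('Frequency fields added.'));
}
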
Meteor.methods({
  'export': () => {
    cjklib.promise.then(Meteor.bindEnvironment(dumpToNewSchemaJSON))
                  .catch(console.error.bind(console));
  },
  'loadFromOldSchemaJSON': (filename) => {
    cjklib.promise.then(
        Meteor.bindEnvironment(() => loadFromOldSchemaJSON(filename)))
        .catch(console.error.bind(console));
  },
});
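
// Illustrative sketch (an addition, not called anywhere): invoking the
// methods above from server code. The 'old-glyphs.txt' filename is
// hypothetical; it must name a file under the app's public directory.
const demoInvokeMigrationMethods = () => {
  Meteor.call('export');
  Meteor.call('loadFromOldSchemaJSON', 'old-glyphs.txt');
}
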
Meteor.startup(() => {
  const completion_callback = undefined;
  const per_glyph_callback = undefined;
  if (!per_glyph_callback && !completion_callback) {
    return;
  }
  console.log('Preparing for migration...');
  const migration = () => runMigration(per_glyph_callback, completion_callback);
  cjklib.promise.then(Meteor.bindEnvironment(migration))
                .catch(console.error.bind(console));
});