import {assert, getPWD, maybeRequire} from '/lib/base';

const fs = maybeRequire('fs');
const path = maybeRequire('path');

// The per-character tables loaded into cjklib.characters, in sorted order.
const CHARACTER_FIELDS = ['character', 'decomposition', 'definition',
                          'frequency', 'kangxi_index', 'pinyin', 'simplified',
                          'strokes', 'traditional'];

const cjklib = {
  characters: {},
  gb2312: {},
  // Resolves once every dataset below has loaded and been post-processed.
  promise: undefined,
  radicals: {
    primary_radical: {},
    index_to_radical_map: {},
    radical_to_index_map: {},
    radical_to_character_map: {},
  },
  // Returns an object with one entry per CHARACTER_FIELDS field for the given
  // character. `traditional` defaults to an empty list when no variant exists.
  getCharacterData(character) {
    const result = {};
    // forEach, not map: we only want the side effect of filling `result`.
    CHARACTER_FIELDS.forEach(
        (field) => result[field] = cjklib.characters[field][character]);
    result.character = character;
    result.traditional = result.traditional || [];
    return result;
  },
};

CHARACTER_FIELDS.forEach((field) => cjklib.characters[field] = {});

// Input: String contents of a cjklib data file.
// Output: a list of rows, each of which is a list of String columns.
const getCJKLibRows = (data) => {
  const lines = data.split('\n');
  return lines.filter((line) => line.length > 0 && line[0] !== '#')
              .map((line) => line.split(',').map(
                  (entry) => entry.replace(/["']/g, '')));
}

// Input: String contents of a TSV data file.
// Output: a list of rows, each of which is a list of String columns.
const getFrequencyRows = (data) => {
  const lines = data.split('\n');
  return lines.filter((line) => line.length > 0 && line[0] !== '#')
              .map((line) => line.split('\t'));
}

// Input: String contents of a Unihan data file.
// Output: a list of rows, each of which is a list of String columns.
const getUnihanRows = (data) => {
  const lines = data.split('\n');
  return lines.filter((line) => line.length > 0 && line[0] !== '#')
              .map((line) => line.split('\t'));
}

// Input: a String of the form 'U+XXXX' representing a Unicode codepoint.
// Output: the character at that codepoint
const parseUnicodeStr =
    (str) => String.fromCodePoint(parseInt(str.substr(2), 16));

// Input: the path to a Unihan data file, starting from the public directory.
// Output: Promise that resolves to the String contents of that file.
const readFile = (filename) => new Promise((resolve, reject) => {
  if (Meteor.isServer) {
    const filepath = path.join(getPWD(), 'public', filename);
    fs.readFile(filepath, 'utf8', (error, data) => {
      // BUG FIX: the original threw here. A throw inside an async callback
      // escapes the Promise entirely (uncaught exception) and the Promise
      // never settles. Reject instead so callers' .catch handlers fire.
      if (error) {
        reject(error);
        return;
      }
      resolve(data);
    });
  } else {
    $.get(filename, (data, code) => {
      // Same fix on the client path: reject rather than throw inside the
      // jQuery callback, so the returned Promise settles on failure.
      if (code !== 'success') {
        reject(new Error(code));
        return;
      }
      resolve(data);
    });
  }
});

// Promises that fill data from specific tables.

// Output: Promise that fills result with a mapping character -> decomposition.
// The decompositions are formatted using Ideographic Description Sequence
// symbols - see the Unicode standard for more details.
const fillDecompositions = (decompositions, glyphs, result) => {
  return Promise.all([decompositions, glyphs]).then(([rows, glyphs]) => {
    // Keep only the decomposition of the glyph variant used in this locale
    // (or glyph 0 when the locale map has no entry for the character).
    rows.filter((row) => parseInt(row[2], 10) === (glyphs[row[0]] || 0))
        .forEach((row) => result[row[0]] = row[1]);
  });
}

// Output: Promise that fills result with a mapping character -> definition.
const fillDefinitions = (readings, result) => {
  return readings.then((rows) => {
    rows.filter((row) => row[1] === 'kDefinition')
        .forEach((row) => result[parseUnicodeStr(row[0])] = row[2]);
  });
}

// Output: Promise that fills result with a mapping character -> frequency rank.
const fillFrequencies = (readings, result) => {
  return readings.then((rows) => {
    // Rows are [rank, character, ...]; lower rank means more frequent.
    rows.forEach((row) => result[row[1]] = parseInt(row[0], 10));
  });
}

// Output: Promise that fills result with a mapping character -> Kangxi radical-
// stroke count, which is a pair of integers [radical, extra_strokes].
const fillKangxiIndex = (readings, result) => {
  return readings.then((rows) => {
    // kRSKangXi values look like "radical.extra_strokes", e.g. "85.4".
    const getIndex = (adotb) => adotb.split('.').map((x) => parseInt(x, 10));
    rows.filter((row) => row[1] === 'kRSKangXi')
        .forEach((row) => result[parseUnicodeStr(row[0])] = getIndex(row[2]));
  });
}

// Output: Promise that fills result with a mapping character -> Pinyin.
const fillPinyin = (readings, result) => {
  return readings.then((rows) => {
    // Record the kMandarin reading for each character in the Readings table.
    for (const row of rows) {
      if (row[1] === 'kMandarin') {
        result[parseUnicodeStr(row[0])] = row[2];
      }
    }
  });
}

// Output: Promise that fills result with a mapping character -> stroke count.
const fillStrokeCounts = (dictionary_like_data, result) => {
  return dictionary_like_data.then((rows) => {
    for (const row of rows) {
      if (row[1] === 'kTotalStrokes') {
        result[parseUnicodeStr(row[0])] = parseInt(row[2], 10);
      }
    }
  });
}

// Output: Promise that fills multiple dictionaries in the result:
//   - index_to_radical_map: Map from index -> list of radicals at that index
//   - radical_to_index_map: Map from radical -> index of that radical
//   - primary_radical: Map from index -> primary radical at that index
const fillRadicalData = (locale, radicals, result) => {
  return radicals.then((rows) => {
    for (const [index, radical, type, locales] of rows) {
      if (!result.index_to_radical_map.hasOwnProperty(index)) {
        result.index_to_radical_map[index] = [];
      }
      result.index_to_radical_map[index].push(radical);
      result.radical_to_index_map[radical] = index;
      // 'R' rows mark the primary radical form for locales listing `locale`.
      if (type === 'R' && locales.indexOf(locale) >= 0) {
        result.primary_radical[index] = radical;
      }
    }
  });
}

// Output: Promise that fills result with a map from Unicode radical-codeblock
// character -> equivalent Unicode CJK-codeblock (hopefully, GB2312) character.
// There may be Unicode radical characters without a CJK equivalent.
const fillRadicalToCharacterMap =
    (locale, radical_equivalent_characters, result) => {
  return radical_equivalent_characters.then((rows) => {
    for (const row of rows) {
      if (row[2].indexOf(locale) >= 0) {
        result[row[0]] = row[1];
      }
    }
  });
}

// Output: Promise that fills the two maps with pointers from a given character
// to its simplified and traditional variants.
const fillVariants = (simplified, traditional, variants) => {
  return variants.then((rows) => {
    rows.forEach((row) => {
      const is_variant = row[1] === 'kSimplifiedVariant' ||
                         row[1] === 'kTraditionalVariant';
      // Unicode introduced an extra character U+2B5B8 matching U+613F.
      if (!is_variant || row[0] === row[2] || row[0] === 'U+2B5B8') {
        return;
      }
      let source = parseUnicodeStr(row[0]);
      let target = parseUnicodeStr(row[2]);
      const split = row[2].split(' ');
      // A number of characters have multiple simplified variants. Of these,
      // we should only use one of them, usually the first, but in three cases,
      // the second.
      if (split.length === 2 &&
          ['U+937E', 'U+949F', 'U+9918'].indexOf(row[0]) >= 0) {
        target = parseUnicodeStr(split[1]);
      }
      if (source === target) {
        return;
      }
      // Normalize so that `source` is always traditional, `target` simplified.
      if (row[1] === 'kTraditionalVariant') {
        [source, target] = [target, source];
      }
      // The mapping from traditional characters to simplified characters is
      // many to one, so we can only assert that simplified[source] is unique.
      assert(!simplified[source] || simplified[source] === target);
      simplified[source] = target;
      traditional[target] = _.unique(
          (traditional[target] || []).concat([source]));
    });
  });
}

// Given the data from the GB2312 data file, fills the GB2312 result map.
const fillGB2312 = (data, result) => {
  for (const character of data) {
    if (character === '\n') continue;
    assert(character.length === 1);
    const codepoint = character.codePointAt(0);
    // Every GB2312 character should live in the CJK Unified Ideographs block.
    assert(0x4e00 <= codepoint && codepoint <= 0x9fff);
    result[character] = true;
  }
  // GB2312 defines exactly 6763 hanzi.
  assert(Object.keys(result).length === 6763);
}

// Given the rows of the locale-character map from the cjklib data, returns a
// mapping from characters to the appropriate glyph in that locale.
const parseLocaleGlyphMap = (locale, rows) => {
  const result = {};
  for (const row of rows) {
    if (row[2].indexOf(locale) >= 0) {
      result[row[0]] = parseInt(row[1], 10);
    }
  }
  return result;
}

// Methods used for final post-processing of the loaded datasets.
const cleanupCJKLibData = () => { const characters = cjklib.characters; const radicals = cjklib.radicals; const convert_astral_characters = (x) => x.length === 1 ? x : '?' const radical_to_character = (x) => radicals.radical_to_character_map[x] || x; Object.keys(characters.decomposition).map((character) => { // Convert any 'astral characters' - that is, characters outside the Basic // Multilingual Plane - to wide question marks and replace radicals with an // equivalent character with that character. const decomposition = characters.decomposition[character]; characters.decomposition[character] = Array.from(decomposition).map(convert_astral_characters) .map(radical_to_character).join(''); }); for (let i = 1; i <= 214; i++) { // All primary radicals should have an equivalent character form. const primary = radicals.primary_radical[i]; assert(radicals.radical_to_character_map.hasOwnProperty(primary)); radicals.primary_radical[i] = radicals.radical_to_character_map[primary]; radicals.index_to_radical_map[i] = radicals.index_to_radical_map[i].map(radical_to_character).unique(); } Object.keys(radicals.radical_to_index_map).map((radical) => { const character = radical_to_character(radical); if (character !== radical) { radicals.radical_to_index_map[character] = radicals.radical_to_index_map[radical]; delete radicals.radical_to_index_map[radical]; } }); delete radicals.radical_to_character_map; } Meteor.startup(() => { // cjklib database data. 
const locale = 'C'; const decomposition = readFile('cjklib/characterdecomposition.csv').then(getCJKLibRows); const glyphs = readFile('cjklib/localecharacterglyph.csv') .then(getCJKLibRows) .then(parseLocaleGlyphMap.bind(null, locale)); const radicals = readFile('cjklib/kangxiradical.csv').then(getCJKLibRows); const radical_equivalent_characters = readFile('cjklib/radicalequivalentcharacter.csv').then(getCJKLibRows); const radical_isolated_characters = readFile('cjklib/kangxiradicalisolatedcharacter.csv').then(getCJKLibRows); // Jun Da's character frequency data, used only for prioritization. const frequencies = readFile('junda/character_frequency.tsv') .then(getFrequencyRows); // Unihan database data. const dictionary_like_data = readFile('unihan/Unihan_DictionaryLikeData.txt').then(getUnihanRows); const radical_stroke_counts = readFile('unihan/Unihan_RadicalStrokeCounts.txt').then(getUnihanRows); const readings = readFile('unihan/Unihan_Readings.txt').then(getUnihanRows); const variants = readFile('unihan/Unihan_Variants.txt').then(getUnihanRows); cjklib.promise = Promise.all([ // Per-character data. fillDecompositions(decomposition, glyphs, cjklib.characters.decomposition), fillDefinitions(readings, cjklib.characters.definition), fillFrequencies(frequencies, cjklib.characters.frequency), fillKangxiIndex(radical_stroke_counts, cjklib.characters.kangxi_index), fillPinyin(readings, cjklib.characters.pinyin), fillStrokeCounts(dictionary_like_data, cjklib.characters.strokes), // Per-radical data. fillRadicalData(locale, radicals, cjklib.radicals), fillRadicalData(locale, radical_isolated_characters, cjklib.radicals), fillRadicalToCharacterMap(locale, radical_equivalent_characters, cjklib.radicals.radical_to_character_map), fillVariants(cjklib.characters.simplified, cjklib.characters.traditional, variants), // Extract the list of characters in the GB2312 character set. 
readFile('gb2312').then((data) => fillGB2312(data, cjklib.gb2312)), ]).then(cleanupCJKLibData); cjklib.promise.catch(console.error.bind(console)); }); export {cjklib};