diff --git a/node-tests/test-xml.ts b/node-tests/test-xml.ts index d5703a0a4..66e4ce2a2 100644 --- a/node-tests/test-xml.ts +++ b/node-tests/test-xml.ts @@ -32,4 +32,15 @@ describe("xml parser", () => { parser.parse("<>"&'"); assert.equal("<>\"&'", last_data); }); + + it("resolves greek letter entities in attributes", () => { + parser.parse("blah"); + assert.equal("Ω", last_attrs.text); + }); + + it("resolves entities in element text", () => { + parser.parse("Ω"); + assert.equal("Ω", last_data); + }); + }); diff --git a/xml/xml.ts b/xml/xml.ts index 4c94d8414..cef63acf7 100644 --- a/xml/xml.ts +++ b/xml/xml.ts @@ -62,35 +62,35 @@ export class ParserEvent implements definition.ParserEvent { } } -var _ampCodes = {Tab: 9, NewLine: 10, excl: 33, quot: 34, QUOT: 34, num: 35, dollar: 36, percent: 37, amp: 38, AMP: 38, apos: 39, lpar: 40, rpar: 41, ast: 42, midast: 42, plus: 43, comma: 44, period: 46, sol: 47, colon: 58, semi: 59, lt: 60, LT: 60, equals: 61, gt: 62, GT: 62, quest: 63, commat: 64, lsqb: 91, lbrack: 91, bsol: 92, rsqb: 92, rbrack: 92, Hat: 94, lowbar: 95, grave: 96, DiacriticalGrave: 96, lcub: 123, lbrace: 123, verbar: 124, vert: 124, VerticalLine: 124, rcub: 125, rbrace: 125, brkbar:166, hibar: 175, Dstrok: 208, fnof: 402, imped: 437, gacute: 501, jmath: 567, circ: 710, caron: 711, Hacek: 711, breve: 728, Breve: 728, dot: 729, DiacriticalDot: 729, ring: 730, ogon: 731, tilde: 732, DiacriticalTilde: 732, dblac: 733, DiacriticalDoubleAcute: 733, DownBreve: 785, UnderBar: 818, epsiv: 949, varepsilon: 949, sigmav:962, varsigma: 962, thetav: 977, vartheta: 977, thetasym: 977, Upsi: 978, upsih: 978, straightphi: 981, piv: 982, varpi: 982, Gammad: 988, gammad: 989, digamma: 989, kappav: 1008, varkappa: 1008, rhov: 1009, varrho: 1009, epsi:1013, straightepsilon: 1013, bepsi: 1014, backepsilon: 1014, /* Skipped Codes 1015 - 1119 */ euro: 8364, trade: 8482, TRADE: 8482, forall: 8704, part: 8706, larr: 8592, rarr: 8593, hyphen: 8208, dash: 8208, ndash: 8211, mdash: 8212, horbar: 8213, Vert: 8214, Verbar: 8214, lsquo: 8216, OpenCurlyQuote: 8216, rsquo: 8217, rsquor: 8217, CloseCurlyQuote: 8217, lsquor: 8218, sbquo: 8218, ldquo: 8220, OpenCurlyDoubleQuote: 8220, rdquo: 8221, rdquor: 8221, CloseCurlyDoubleQuote: 8221, ldquor: 8222, bdquo: 8222, dagger: 8224, Dagger: 8225, ddagger: 8225, bull: 8226, bullet: 8226, nldr: 8229, hellip: 8230, mldr: 8230, hybull: 8259, tdot: 8411, TripleDot: 8411 ,DotDot: 8412, star: 9734, phone: 9742, spades: 9824, clubs: 9827, hearts: 9829, diams: 9830, female: 9792, male: 9794, check: 10003, checkmark: 10003, cross: 10007, VerticalSeparator: 10072, EmptySmallSquare: 9723, FilledSmallSquare: 9724, starf: 9733, bigstar: 9733, square: 9633, squ: 9633, Square: 9633}; -var _amp160List = ['nbsp','iexcl','cent','pound','curren','yen','brvbar','sect','uml','copy','ordf','laquo','not','shy','reg','macr','deg','plusmn','sup2','sup3','acute','micro','para','middot','cedil','sup1','ordm','raquo','frac14','frac12','frac34','iquest','Agrave','Aacute','Acirc','Atilde','Auml','Aring','AElig','Ccedil','Egrave','Eacute','Ecirc','Euml','Igrave','Iacute','Icirc','Iuml','ETH','Ntilde','Ograve','Oacute','Ocirc','Otilde','Ouml','times','Oslash','Ugrave','Uacute','Ucirc','Uuml','Yacute','THORN','szlig','agrave','aacute','acirc','atilde','auml','aring','aelig','ccedil','egrave','eacute','ecirc','euml','igrave','iacute','icirc','iuml','eth','ntilde','ograve','oacute','ocirc','otilde','ouml','divide','oslash','ugrave','uacute','ucirc','uuml','yacute','thorn','yuml','Amacr','amacr','Abreve','abreve','Aogon','aogon']; -var _ampGreekUpper = ['Alpha','Beta','Gamma','Delta','Epsilon','Zeta','Eta','Theta','Iota','Kappa','Lambda','Mu','Nu','Xi','Omicron','Pi','Rho','M!SS!NG','Sigma','Tau','Upsilon','Phi','Chi','Psi','Omega']; -var _ampGreekLower = ['alpha','beta','gamma','delta','epsilon','zeta','eta','theta','iota','kappa','lambda','mu','nu','xi','omicron','pi','rho','sigmaf','sigma','tau','upsilon','phi','chi','psi','omega']; +var _ampCodes; var _entitySearchRegEx = /&#(\d+);|&#x([0123456789abcdef]+);|&(\w+);/ig; +function _generateAmpMap(): any { + var objCodes = {Tab: 9, NewLine: 10, excl: 33, quot: 34, QUOT: 34, num: 35, dollar: 36, percent: 37, amp: 38, AMP: 38, apos: 39, lpar: 40, rpar: 41, ast: 42, midast: 42, plus: 43, comma: 44, period: 46, sol: 47, colon: 58, semi: 59, lt: 60, LT: 60, equals: 61, gt: 62, GT: 62, quest: 63, commat: 64, lsqb: 91, lbrack: 91, bsol: 92, rsqb: 92, rbrack: 92, Hat: 94, lowbar: 95, grave: 96, DiacriticalGrave: 96, lcub: 123, lbrace: 123, verbar: 124, vert: 124, VerticalLine: 124, rcub: 125, rbrace: 125, nbsp: 160, iexcl: 161, cent: 162, pound:163, curren: 164, yen: 165, brvbar: 166, brkbar:166, sect:167, uml:168, copy:169, ordf:170, laquo:171, not: 172, shy:173, reg:174, macr:175, hibar: 175, deg:176, plusmn: 177, sup2: 178, sup3: 179, acute:180, micro:181, para:182, middot:183, cedil:184, sup1:185, ordm:186, raquo:187, frac14:188, frac12:189, frac34:190, iquest:191, Agrave:192, Aacute:193, Acirc:194, Atilde:195, Auml:196, Aring:197, AElig:198, Ccedil:199, Egrave:200, Eacute:201, Ecirc:202, Euml:203, Igrave:204, Iacute:205, Icirc:206, Iuml:207, ETH:208, Dstrok: 208, Ntilde:209, Ograve:210, Oacute:211, Ocirc:212, Otilde: 213, Ouml:214, times:215, Oslash:216, Ugrave:217, Uacute:218, Ucirc:219, Uuml:220, Yacute:221, THORN:222, szlig:223, agrave:224, aacute:225, acirc:226, atilde:227, auml:228, aring:229, aelig:230, ccedil:231, egrave:232, eacute:233, ecirc:234, euml:235, igrave:236, iacute:237, icirc:238, iuml:239, eth:240, ntilde:241, ograve:242, oacute:243, ocirc:244, otilde:245, ouml:246, divide:247, oslash:248, ugrave:249, uacute:250, ucirc:251, uuml:252, yacute:253, thorn:254, yuml: 255, fnof: 402, imped: 437, gacute: 501, jmath: 567, circ: 710, caron: 711, Hacek: 711, breve: 728, Breve: 728, dot: 729, DiacriticalDot: 729, ring: 730, ogon: 731, tilde: 732, DiacriticalTilde: 732, dblac: 733, DiacriticalDoubleAcute: 733, DownBreve: 785, UnderBar: 818, Alpha: 913, Beta: 914, Gamma: 915, Delta: 916, Epsilon: 917, Zeta: 918, Eta: 919, Theta: 920, Iota: 921, Kappa: 922, Lambda: 923, Mu: 924, Nu: 925, Xi: 926, Omicron: 927, Pi: 928, Rho: 929 /* 930 is not real */, Sigma: 931, Tau: 932, Upsilon: 933, Phi: 934, Chi: 935, Psi: 936, Omega: 937, alpha: 945, beta: 946, gamma: 947, delta: 948, epsilon: 949, epsiv: 949, varepsilon: 949, zeta: 950, eta: 951, theta: 952, iota: 953, kappa: 954, lambda: 955, mu: 956, nu: 957, xi: 958, omicron: 959, pi: 960, rho: 961, sigmaf: 962, sigmav: 962, varsigma: 962, sigma: 963, tau: 964, upsilon: 965, phi: 966, chi: 967, psi: 968, omega: 969, thetav: 977, vartheta: 977, thetasym: 977, Upsi: 978, upsih: 978, straightphi: 981, piv: 982, varpi: 982, Gammad: 988, gammad: 989, digamma: 989, kappav: 1008, varkappa: 1008, rhov: 1009, varrho: 1009, epsi:1013, straightepsilon: 1013, bepsi: 1014, backepsilon: 1014, /* Skipped Codes 1015 - 1119 */ euro: 8364, trade: 8482, TRADE: 8482, forall: 8704, part: 8706, larr: 8592, rarr: 8593, hyphen: 8208, dash: 8208, ndash: 8211, mdash: 8212, horbar: 8213, Vert: 8214, Verbar: 8214, lsquo: 8216, OpenCurlyQuote: 8216, rsquo: 8217, rsquor: 8217, CloseCurlyQuote: 8217, lsquor: 8218, sbquo: 8218, ldquo: 8220, OpenCurlyDoubleQuote: 8220, rdquo: 8221, rdquor: 8221, CloseCurlyDoubleQuote: 8221, ldquor: 8222, bdquo: 8222, dagger: 8224, Dagger: 8225, ddagger: 8225, bull: 8226, bullet: 8226, nldr: 8229, hellip: 8230, mldr: 8230, hybull: 8259, tdot: 8411, TripleDot: 8411 ,DotDot: 8412, star: 9734, phone: 9742, spades: 9824, clubs: 9827, hearts: 9829, diams: 9830, female: 9792, male: 9794, check: 10003, checkmark: 10003, cross: 10007, VerticalSeparator: 10072, EmptySmallSquare: 9723, FilledSmallSquare: 9724, starf: 9733, bigstar: 9733, square: 9633, squ: 9633, Square: 9633}; + var ampCodes = new Map(); + for (var key in objCodes) { + if (objCodes.hasOwnProperty(key)) { + ampCodes.set(key, objCodes[key]); + } + } + return ampCodes; +} + function _HandleAmpEntities(found: string, decimalValue: string, hexValue: string, wordValue: string): string { if (wordValue) { - if (_ampCodes.hasOwnProperty(wordValue)) { - return String.fromCharCode(_ampCodes[wordValue]); + if (!_ampCodes) { + _ampCodes = _generateAmpMap(); } - var idx = _amp160List.indexOf(wordValue); - if (idx >= 0) { - return String.fromCharCode(160 + idx); - } - idx = _ampGreekUpper.indexOf(wordValue); - if (idx >= 0) { - return String.fromCharCode(913 + idx); - } - idx = _ampGreekLower.indexOf(wordValue); - if (idx >= 0) { - return String.fromCharCode(945 + idx); + var res = _ampCodes.get(wordValue); + if (res) { + return String.fromCharCode(res); } // Invalid word; so we just return it return found; } if (decimalValue) { - return String.fromCharCode(parseInt(decimalValue,10)); + return String.fromCharCode(parseInt(decimalValue, 10)); } return String.fromCharCode(parseInt(hexValue, 16)); @@ -143,7 +143,7 @@ export class XmlParser implements definition.XmlParser { }); this._parser.on('textNode', function (text, uq) { - var data = uq(text);// Decode entity references such as < and > + var data = uq(XmlParser._dereferenceEntities(text));// Decode entity references such as < and > onEvent(new ParserEvent(ParserEventType.Text, undefined, undefined, undefined, undefined, data)); }); @@ -262,26 +262,7 @@ export class XmlParser implements definition.XmlParser { s = String(s); if (s.length > 3 && s.indexOf('&') !== -1) { s = s.replace(_entitySearchRegEx, _HandleAmpEntities); - /* if (s.indexOf('<') !== -1) { - s = s.replace(/</g, '<'); - } - - if (s.indexOf('>') !== -1) { - s = s.replace(/>/g, '>'); - } - - if (s.indexOf('&') !== -1) { - s = s.replace(/&/g, '&'); - } - - if (s.indexOf(''') !== -1) { - s = s.replace(/'/g, "'"); - } - - if (s.indexOf('"') !== -1) { - s = s.replace(/"/g, '"'); - }*/ - }; + } return s; }