diff --git a/packages/lexical-playground/__tests__/regression/7163-graphemes.spec.mjs b/packages/lexical-playground/__tests__/regression/7163-graphemes.spec.mjs new file mode 100644 index 000000000..89b85c16a --- /dev/null +++ b/packages/lexical-playground/__tests__/regression/7163-graphemes.spec.mjs @@ -0,0 +1,246 @@ +/** + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + * + */ + +import { + moveLeft, + moveRight, + pressBackspace, + selectAll, +} from '../keyboardShortcuts/index.mjs'; +import { + assertHTML, + assertSelection, + focusEditor, + html, + initialize, + test, +} from '../utils/index.mjs'; + +// Regression tests for #7163 +test.describe.configure({mode: 'parallel'}); +test.describe('Grapheme deleteCharacter', () => { + test.beforeEach(({isPlainText, isCollab, page}) => + initialize({isCollab, isPlainText, page}), + ); + [ + // Original scenarios + { + backspaceCount: 3, + caretDistance: 1, + description: 'grapheme cluster', + // Hangul grapheme cluster. + // https://www.compart.com/en/unicode/U+AC01 + grapheme: '\u1100\u1161\u11A8', + }, + { + backspaceCount: 2, + caretDistance: 1, + description: 'extended grapheme cluster', + // Tamil 'ni' grapheme cluster. + // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters + grapheme: '\u0BA8\u0BBF', + }, + { + backspaceCount: 4, + caretDistance: 1, + description: 'tailored grapheme cluster', + + // Unclear why Firefox behaves differently here + firefoxCaretDistance: 2, + + // Devangari 'kshi' tailored grapheme cluster. + // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters + grapheme: '\u0915\u094D\u0937\u093F', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: 'Emoji sequence combined using zero-width joiners', + // https://emojipedia.org/family-woman-woman-girl-boy/ + grapheme: + '\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: 'Emoji sequence with skin-tone modifier', + // https://emojipedia.org/clapping-hands-medium-skin-tone/ + grapheme: '\uD83D\uDC4F\uD83C\uDFFD', + }, + // New ones + { + backspaceCount: 2, + caretDistance: 1, + description: 'Arabic text with accent', + dir: 'rtl', + grapheme: '\u0647\u064e', + }, + { + // Editors that do normalization would do this in 1 backspace + backspaceCount: 2, + caretDistance: 1, + description: 'Latin with decomposed combining character', + grapheme: 'n\u0303', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: + 'Multiple codepoint Emoji with variation selector entirely in the BMP', + grapheme: '\u2764\ufe0f', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: 'Multiple codepoint keycap Emoji', + grapheme: '#\ufe0f\u20e3', + }, + { + backspaceCount: 8, + caretDistance: 4, + description: 'Hindi', + // Unclear why this differs + firefoxCaretDistance: 5, + + grapheme: '\u0905\u0928\u0941\u091a\u094d\u091b\u0947\u0926', + }, + { + backspaceCount: 4, + caretDistance: 2, + description: 'Korean', + grapheme: '\u1103\u1167\u1109\u1170', + }, + { + backspaceCount: 5, + caretDistance: 5, + description: 'Emojis outside the BMP', + grapheme: '\ud83c\udf37\ud83c\udf81\ud83d\udca9\ud83d\ude1c\ud83d\udc4d', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: + 'ZWJ emoji cluster with variation selection that looks like 4 characters but treated as one', + grapheme: + '\ud83d\udc69\ud83c\udffd\u200d\ud83d\udc68\ud83c\udffd\u200d\ud83d\udc76\ud83c\udffd\u200d\ud83d\udc66\ud83c\udffd', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: 'Flag emoji with ZWJ and variation selector', + grapheme: '\ud83c\udff3\ufe0f\u200d\ud83c\udf08', + }, + { + backspaceCount: 1, + caretDistance: 1, + description: 'Chinese for seaborgium (surrogate pair)', + grapheme: '\ud862\udf4e', + }, + ].forEach( + ({ + backspaceCount, + caretDistance, + description, + grapheme, + dir = 'ltr', + skip = false, + firefoxCaretDistance = undefined, + }) => { + test(description, async ({page, browserName, isCollab, isPlainText}) => { + // We are only concerned about input here, not collab. + test.skip(isCollab || skip); + + // You can render a grapheme with escape sequences like this: + // const fmt = (s) => "'" + Array.from({ length: s.length }, (_, i) => `\\u${s.charCodeAt(i).toString(16).padStart(4, '0')}`).join('') + "'"; + // e.g. from dev tools copy(fmt('emoji here')) and then paste it in your text editor + await focusEditor(page); + const codeUnits = grapheme.length; + await page.keyboard.type(description); + await page.keyboard.press('Enter'); + const expectedInitialHTML = isPlainText + ? html` +

+ ${description} +
+
+

+ ` + : html` +

+ ${description} +

+


+ `; + const expectedGraphemeHTML = isPlainText + ? html` +

+ ${description} +
+ ${grapheme} +

+ ` + : html` +

+ ${description} +

+

+ ${grapheme} +

+ `; + await assertHTML(page, expectedInitialHTML, undefined, { + ignoreClasses: true, + ignoreInlineStyles: true, + }); + await page.keyboard.insertText(grapheme); + await assertHTML(page, expectedGraphemeHTML, undefined, { + ignoreClasses: true, + ignoreInlineStyles: true, + }); + const selectionFromOffset = (offset, path) => ({ + anchorOffset: offset, + anchorPath: path, + focusOffset: offset, + focusPath: path, + }); + const graphemePath = isPlainText ? [0, 2, 0] : [1, 0, 0]; + const initialPath = isPlainText ? [0] : [1]; + const initialOffset = isPlainText ? 2 : 0; + await assertSelection( + page, + selectionFromOffset(codeUnits, graphemePath), + ); + if (dir !== 'rtl') { + // It's unclear why Firefox navigates differently in these specific, + // cases, but that's not pertinent to what we're testing + const testedCaretDistance = + browserName === 'firefox' && firefoxCaretDistance !== undefined + ? firefoxCaretDistance + : caretDistance; + await moveLeft(page, testedCaretDistance); + await assertSelection(page, selectionFromOffset(0, graphemePath)); + await moveRight(page, testedCaretDistance); + } + await assertSelection( + page, + selectionFromOffset(codeUnits, graphemePath), + ); + await pressBackspace(page, backspaceCount); + await assertSelection( + page, + selectionFromOffset(initialOffset, initialPath), + ); + await assertHTML(page, expectedInitialHTML, undefined, { + ignoreClasses: true, + ignoreInlineStyles: true, + }); + await selectAll(page); + await pressBackspace(page); + }); + }, + ); +}); diff --git a/packages/lexical-react/src/LexicalHashtagPlugin.ts b/packages/lexical-react/src/LexicalHashtagPlugin.ts index 0f89717f7..2c936ab73 100644 --- a/packages/lexical-react/src/LexicalHashtagPlugin.ts +++ b/packages/lexical-react/src/LexicalHashtagPlugin.ts @@ -201,9 +201,10 @@ function getHashtagRegexStringChars(): Readonly<{ '\u1040-\u1049\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9' + '\uFF10-\uFF19'; - // An alpha char is a unicode chars including unicode marks or - // letter or char in otherChars range - const alpha = unicodeLetters + unicodeAccents + otherChars; + // An alpha char is a unicode chars excluding unicode combining marks + // but including other chars, a hashtag must start with one of these, + // it does not make sense to have a combining mark before a base character. + const alpha = unicodeLetters + otherChars; // A numeric character is any with the number digit property, or // underscore. These characters can be included in hashtags, but a hashtag @@ -212,7 +213,7 @@ function getHashtagRegexStringChars(): Readonly<{ // Alphanumeric char is any alpha char or a unicode char with decimal // number property \p{Nd} - const alphanumeric = alpha + numeric; + const alphanumeric = alpha + unicodeAccents + numeric; const hashChars = '#\\uFF03'; // normal '#' or full-width '#' return { diff --git a/packages/lexical-selection/src/__tests__/unit/LexicalSelection.test.tsx b/packages/lexical-selection/src/__tests__/unit/LexicalSelection.test.tsx index b65d09f0a..a6ac3317f 100644 --- a/packages/lexical-selection/src/__tests__/unit/LexicalSelection.test.tsx +++ b/packages/lexical-selection/src/__tests__/unit/LexicalSelection.test.tsx @@ -239,39 +239,6 @@ describe('LexicalSelection tests', () => { expect(actualSelection.focusOffset).toBe(expectedSelection.focusOffset); } - // eslint-disable-next-line @typescript-eslint/no-unused-vars - const GRAPHEME_SCENARIOS = [ - { - description: 'grapheme cluster', - // Hangul grapheme cluster. - // https://www.compart.com/en/unicode/U+AC01 - grapheme: '\u1100\u1161\u11A8', - }, - { - description: 'extended grapheme cluster', - // Tamil 'ni' grapheme cluster. - // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters - grapheme: '\u0BA8\u0BBF', - }, - { - description: 'tailored grapheme cluster', - // Devangari 'kshi' tailored grapheme cluster. - // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters - grapheme: '\u0915\u094D\u0937\u093F', - }, - { - description: 'Emoji sequence combined using zero-width joiners', - // https://emojipedia.org/family-woman-woman-girl-boy/ - grapheme: - '\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66', - }, - { - description: 'Emoji sequence with skin-tone modifier', - // https://emojipedia.org/clapping-hands-medium-skin-tone/ - grapheme: '\uD83D\uDC4F\uD83C\uDFFD', - }, - ]; - const suite = [ { expectedHTML: @@ -735,64 +702,6 @@ describe('LexicalSelection tests', () => { }, // Tests need fixing: - // ...GRAPHEME_SCENARIOS.flatMap(({description, grapheme}) => [ - // { - // name: `Delete backward eliminates entire ${description} (${grapheme})`, - // inputs: [insertText(grapheme + grapheme), deleteBackward(1)], - // expectedHTML: `

${grapheme}

`, - // expectedSelection: { - // anchorPath: [0, 0, 0], - // anchorOffset: grapheme.length, - // focusPath: [0, 0, 0], - // focusOffset: grapheme.length, - // }, - // setup: emptySetup, - // }, - // { - // name: `Delete forward eliminates entire ${description} (${grapheme})`, - // inputs: [ - // insertText(grapheme + grapheme), - // moveNativeSelection([0, 0, 0], 0, [0, 0, 0], 0), - // deleteForward(), - // ], - // expectedHTML: `

${grapheme}

`, - // expectedSelection: { - // anchorPath: [0, 0, 0], - // anchorOffset: 0, - // focusPath: [0, 0, 0], - // focusOffset: 0, - // }, - // setup: emptySetup, - // }, - // { - // name: `Move backward skips over grapheme cluster (${grapheme})`, - // inputs: [insertText(grapheme + grapheme), moveBackward(1)], - // expectedHTML: `

${grapheme}${grapheme}

`, - // expectedSelection: { - // anchorPath: [0, 0, 0], - // anchorOffset: grapheme.length, - // focusPath: [0, 0, 0], - // focusOffset: grapheme.length, - // }, - // setup: emptySetup, - // }, - // { - // name: `Move forward skips over grapheme cluster (${grapheme})`, - // inputs: [ - // insertText(grapheme + grapheme), - // moveNativeSelection([0, 0, 0], 0, [0, 0, 0], 0), - // moveForward(), - // ], - // expectedHTML: `

${grapheme}${grapheme}

`, - // expectedSelection: { - // anchorPath: [0, 0, 0], - // anchorOffset: grapheme.length, - // focusPath: [0, 0, 0], - // focusOffset: grapheme.length, - // }, - // setup: emptySetup, - // }, - // ]), // { // name: 'Jump to beginning and insert', // inputs: [ diff --git a/packages/lexical/src/LexicalEvents.ts b/packages/lexical/src/LexicalEvents.ts index c2d000169..b373f905a 100644 --- a/packages/lexical/src/LexicalEvents.ts +++ b/packages/lexical/src/LexicalEvents.ts @@ -89,7 +89,7 @@ import { $updateSelectedTextFromDOM, $updateTextNodeFromDOMContent, dispatchCommand, - doesContainGrapheme, + doesContainSurrogatePair, getAnchorTextFromDOM, getDOMSelection, getDOMSelectionFromTarget, @@ -224,7 +224,9 @@ function $shouldPreventDefaultAndInsertText( // a dangling `input` event caused by execCommand('insertText'). lastBeforeInputInsertTextTimeStamp < timeStamp + 50)) || (anchorNode.isDirty() && textLength < 2) || - doesContainGrapheme(text)) && + // TODO consider if there are other scenarios when multiple code units + // should be addressed here + doesContainSurrogatePair(text)) && anchor.offset !== focus.offset && !anchorNode.isComposing()) || // Any non standard text node. diff --git a/packages/lexical/src/LexicalSelection.ts b/packages/lexical/src/LexicalSelection.ts index 9c6a454f4..0b03d022c 100644 --- a/packages/lexical/src/LexicalSelection.ts +++ b/packages/lexical/src/LexicalSelection.ts @@ -58,7 +58,7 @@ import { $hasAncestor, $isTokenOrSegmented, $setCompositionKey, - doesContainGrapheme, + doesContainSurrogatePair, getDOMSelection, getDOMTextNode, getElementByKeyOrThrow, @@ -2014,6 +2014,70 @@ function moveNativeSelection( domSelection.modify(alter, direction, granularity); } +/** + * Called by `RangeSelection.deleteCharacter` to determine if + * `this.modify('extend', isBackward, 'character')` extended the selection + * further than a user would expect for that operation. + * + * A short(?) JavaScript string vs. Unicode primer: + * + * Strings in JavaScript use an UTF-16 encoding, and the offsets into a + * string are based on those UTF-16 *code units*. This is basically a + * historical mistake (though logical at that time, decades ago), but + * can never really be fixed for compatibility reasons. + * + * In Unicode, a *code point* is the combination of one or more *code units*. + * and the range of a *code point* can fit into 21 bits. + * + * Every valid *code point* can be represented with one or two + * *UTF-16 code units*. One unit is used when the code point is in the + * Basic Multilingual Plane (BMP) and is `< 0xFFFF`. Anything outside + * of that plane is encoded with a *surrogate pair* of *code units* and + * `/[\uD800-\uDBFF][\uDC00-\uDFFF]/` is a regex that you could use to + * find any valid *surrogate pair*. As far as Unicode is concerned, these + * pairs represent a single *code point*, but in JavaScript, these pairs + * have a length of 2 (`pair.charCodeAt(n)` is really returning a + * UTF-16 *code unit*, not a unicode *code point*). It is possible to request + * a *code point* with `pair.codePointAt(0)` and enumerate code points + * in a string with `[...string]` but the offsets we work with, and + * the string length, are based in *code units* so that functionality + * is unfortunately not very useful here. + * + * This only gets us as far as *code points*. We now know that we must + * consider that each *code point* can have a length of 1 or 2 in JavaScript + * string distance. It gets even trickier because the visual representation + * of a character is a *grapheme* (approximately what the user thinks of + * as a character). A *grapheme* is one or more *code points*, and can + * essentially be arbitrarily long, as there are many ways to combine + * them. + * + * The `this.modify(…)` call has already extended our selection by one + * *grapheme* in the direction we want to delete. Sounds great, it's done + * a lot of awfully tricky work for us because this functionality has only + * recently become available in JavaScript via `Intl.Segmenter`. The + * problem is that in many cases the expected behavior of backspace or + * delete is *not always to delete a whole grapheme*. In some languages + * it's always expected that backspace ought to delete one code point, not the + * whole grapheme. In other situations such as emoji that use variation + * selectors you *do* want to delete the whole *grapheme*. + * + * In a few situations the behavior is even application dependent, such as + * with latin languages where you have multiple ways to represent the same + * character visually (e.g. a letter with an accent in one code point, or a + * letter followed by a combining mark in a second code point); some apps will + * delete the whole grapheme and others will delete only the combining mark, + * probably based on whether they perform some sort of *normalization* on their + * input to ensure that only one form is used when two sequences of code points + * can represent the same visual character. Lexical currently chooses not + * to perform any normalization so this type of combining marks will be + * deleted as a *code point* without deleting the whole *grapheme*. + * + * See also: + * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-2/#G25564 + * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602 + * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G49537 + * https://mathiasbynens.be/notes/javascript-unicode + */ function $updateCaretSelectionForUnicodeCharacter( selection: RangeSelection, isBackward: boolean, @@ -2038,7 +2102,7 @@ function $updateCaretSelectionForUnicodeCharacter( if (startOffset !== characterOffset) { const text = anchorNode.getTextContent().slice(startOffset, endOffset); - if (!doesContainGrapheme(text)) { + if (shouldDeleteExactlyOneCodeUnit(text)) { if (isBackward) { focus.set(focus.key, characterOffset, focus.type); } else { @@ -2046,11 +2110,57 @@ function $updateCaretSelectionForUnicodeCharacter( } } } - } else { - // TODO Handling of multibyte characters } } +function shouldDeleteExactlyOneCodeUnit(text: string) { + if (__DEV__) { + invariant( + text.length > 1, + 'shouldDeleteExactlyOneCodeUnit: expecting to be called only with sequences of two or more code units', + ); + } + return !(doesContainSurrogatePair(text) || doesContainEmoji(text)); +} + +/** + * Given the wall of text in $updateCaretSelectionForUnicodeCharacter, you'd + * think that the solution might be complex, but the only currently known + * cases given the above constraints where we want to delete a whole grapheme + * are when emoji is involved. Since ES6 we can use unicode character classes + * in regexp which makes this simple. + * + * It may make sense to add to this heuristic in the future if other + * edge cases are discovered, which is why detailed notes remain. + * + * This is implemented with runtime feature detection and will always + * return false on pre-2020 platforms that do not have unicode character + * class support. + */ +const doesContainEmoji: (text: string) => boolean = (() => { + try { + const re = new RegExp('\\p{Emoji}', 'u'); + const test = re.test.bind(re); + // Sanity check a few emoji to make sure the regexp was parsed + // and works correctly. Any one of these should be sufficient, + // but they're cheap and it only runs once. + if ( + // Emoji in the BMP (heart) with variation selector + test('\u2764\ufe0f') && + // Emoji in the BMP (#) with variation selector + test('#\ufe0f\u20e3') && + // Emoji outside the BMP (thumbs up) that is encoded with a surrogate pair + test('\ud83d\udc4d') + ) { + return test; + } + } catch (e) { + // SyntaxError + } + // fallback, surrogate pair already checked + return () => false; +})(); + function $removeSegment( node: TextNode, isBackward: boolean, diff --git a/packages/lexical/src/LexicalUtils.ts b/packages/lexical/src/LexicalUtils.ts index d61cb0587..70ff7569b 100644 --- a/packages/lexical/src/LexicalUtils.ts +++ b/packages/lexical/src/LexicalUtils.ts @@ -629,7 +629,13 @@ function getNodeKeyFromDOMTree( return null; } -export function doesContainGrapheme(str: string): boolean { +/** + * Return true if `str` contains any valid surrogate pair. + * + * See also $updateCaretSelectionForUnicodeCharacter for + * a discussion on when and why this is useful. + */ +export function doesContainSurrogatePair(str: string): boolean { return /[\uD800-\uDBFF][\uDC00-\uDFFF]/g.test(str); }