[lexical] Bug Fix: Workaround for delete character with emoji grapheme customers that do not include non-BMP code points (#7175)

2025-05-17 23:26:16 +08:00 · 2025-02-18 06:57:09 -08:00
parent 94db7c71d7
commit 25e5314c7d
6 changed files with 376 additions and 102 deletions
--- a/packages/lexical-playground/tests/regression/7163-graphemes.spec.mjs
+++ b/packages/lexical-playground/tests/regression/7163-graphemes.spec.mjs
@ -0,0 +1,246 @@
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ */
+
+import {
+  moveLeft,
+  moveRight,
+  pressBackspace,
+  selectAll,
+} from '../keyboardShortcuts/index.mjs';
+import {
+  assertHTML,
+  assertSelection,
+  focusEditor,
+  html,
+  initialize,
+  test,
+} from '../utils/index.mjs';
+
+// Regression tests for #7163
+test.describe.configure({mode: 'parallel'});
+test.describe('Grapheme deleteCharacter', () => {
+  test.beforeEach(({isPlainText, isCollab, page}) =>
+    initialize({isCollab, isPlainText, page}),
+  );
+  [
+    // Original scenarios
+    {
+      backspaceCount: 3,
+      caretDistance: 1,
+      description: 'grapheme cluster',
+      // Hangul grapheme cluster.
+      // https://www.compart.com/en/unicode/U+AC01
+      grapheme: '\u1100\u1161\u11A8',
+    },
+    {
+      backspaceCount: 2,
+      caretDistance: 1,
+      description: 'extended grapheme cluster',
+      // Tamil 'ni' grapheme cluster.
+      // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
+      grapheme: '\u0BA8\u0BBF',
+    },
+    {
+      backspaceCount: 4,
+      caretDistance: 1,
+      description: 'tailored grapheme cluster',
+
+      // Unclear why Firefox behaves differently here
+      firefoxCaretDistance: 2,
+
+      // Devangari 'kshi' tailored grapheme cluster.
+      // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
+      grapheme: '\u0915\u094D\u0937\u093F',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description: 'Emoji sequence combined using zero-width joiners',
+      // https://emojipedia.org/family-woman-woman-girl-boy/
+      grapheme:
+        '\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description: 'Emoji sequence with skin-tone modifier',
+      // https://emojipedia.org/clapping-hands-medium-skin-tone/
+      grapheme: '\uD83D\uDC4F\uD83C\uDFFD',
+    },
+    // New ones
+    {
+      backspaceCount: 2,
+      caretDistance: 1,
+      description: 'Arabic text with accent',
+      dir: 'rtl',
+      grapheme: '\u0647\u064e',
+    },
+    {
+      // Editors that do normalization would do this in 1 backspace
+      backspaceCount: 2,
+      caretDistance: 1,
+      description: 'Latin with decomposed combining character',
+      grapheme: 'n\u0303',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description:
+        'Multiple codepoint Emoji with variation selector entirely in the BMP',
+      grapheme: '\u2764\ufe0f',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description: 'Multiple codepoint keycap Emoji',
+      grapheme: '#\ufe0f\u20e3',
+    },
+    {
+      backspaceCount: 8,
+      caretDistance: 4,
+      description: 'Hindi',
+      // Unclear why this differs
+      firefoxCaretDistance: 5,
+
+      grapheme: '\u0905\u0928\u0941\u091a\u094d\u091b\u0947\u0926',
+    },
+    {
+      backspaceCount: 4,
+      caretDistance: 2,
+      description: 'Korean',
+      grapheme: '\u1103\u1167\u1109\u1170',
+    },
+    {
+      backspaceCount: 5,
+      caretDistance: 5,
+      description: 'Emojis outside the BMP',
+      grapheme: '\ud83c\udf37\ud83c\udf81\ud83d\udca9\ud83d\ude1c\ud83d\udc4d',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description:
+        'ZWJ emoji cluster with variation selection that looks like 4 characters but treated as one',
+      grapheme:
+        '\ud83d\udc69\ud83c\udffd\u200d\ud83d\udc68\ud83c\udffd\u200d\ud83d\udc76\ud83c\udffd\u200d\ud83d\udc66\ud83c\udffd',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description: 'Flag emoji with ZWJ and variation selector',
+      grapheme: '\ud83c\udff3\ufe0f\u200d\ud83c\udf08',
+    },
+    {
+      backspaceCount: 1,
+      caretDistance: 1,
+      description: 'Chinese for seaborgium (surrogate pair)',
+      grapheme: '\ud862\udf4e',
+    },
+  ].forEach(
+    ({
+      backspaceCount,
+      caretDistance,
+      description,
+      grapheme,
+      dir = 'ltr',
+      skip = false,
+      firefoxCaretDistance = undefined,
+    }) => {
+      test(description, async ({page, browserName, isCollab, isPlainText}) => {
+        // We are only concerned about input here, not collab.
+        test.skip(isCollab || skip);
+
+        // You can render a grapheme with escape sequences like this:
+        // const fmt = (s) => "'" + Array.from({ length: s.length }, (_, i) => `\\u${s.charCodeAt(i).toString(16).padStart(4, '0')}`).join('') + "'";
+        // e.g. from dev tools copy(fmt('emoji here')) and then paste it in your text editor
+        await focusEditor(page);
+        const codeUnits = grapheme.length;
+        await page.keyboard.type(description);
+        await page.keyboard.press('Enter');
+        const expectedInitialHTML = isPlainText
+          ? html`
+              <p dir="ltr">
+                <span data-lexical-text="true">${description}</span>
+                <br />
+                <br />
+              </p>
+            `
+          : html`
+              <p dir="ltr">
+                <span data-lexical-text="true">${description}</span>
+              </p>
+              <p><br /></p>
+            `;
+        const expectedGraphemeHTML = isPlainText
+          ? html`
+              <p dir="ltr">
+                <span data-lexical-text="true">${description}</span>
+                <br />
+                <span data-lexical-text="true">${grapheme}</span>
+              </p>
+            `
+          : html`
+              <p dir="ltr">
+                <span data-lexical-text="true">${description}</span>
+              </p>
+              <p dir="${dir}">
+                <span data-lexical-text="true">${grapheme}</span>
+              </p>
+            `;
+        await assertHTML(page, expectedInitialHTML, undefined, {
+          ignoreClasses: true,
+          ignoreInlineStyles: true,
+        });
+        await page.keyboard.insertText(grapheme);
+        await assertHTML(page, expectedGraphemeHTML, undefined, {
+          ignoreClasses: true,
+          ignoreInlineStyles: true,
+        });
+        const selectionFromOffset = (offset, path) => ({
+          anchorOffset: offset,
+          anchorPath: path,
+          focusOffset: offset,
+          focusPath: path,
+        });
+        const graphemePath = isPlainText ? [0, 2, 0] : [1, 0, 0];
+        const initialPath = isPlainText ? [0] : [1];
+        const initialOffset = isPlainText ? 2 : 0;
+        await assertSelection(
+          page,
+          selectionFromOffset(codeUnits, graphemePath),
+        );
+        if (dir !== 'rtl') {
+          // It's unclear why Firefox navigates differently in these specific,
+          // cases, but that's not pertinent to what we're testing
+          const testedCaretDistance =
+            browserName === 'firefox' && firefoxCaretDistance !== undefined
+              ? firefoxCaretDistance
+              : caretDistance;
+          await moveLeft(page, testedCaretDistance);
+          await assertSelection(page, selectionFromOffset(0, graphemePath));
+          await moveRight(page, testedCaretDistance);
+        }
+        await assertSelection(
+          page,
+          selectionFromOffset(codeUnits, graphemePath),
+        );
+        await pressBackspace(page, backspaceCount);
+        await assertSelection(
+          page,
+          selectionFromOffset(initialOffset, initialPath),
+        );
+        await assertHTML(page, expectedInitialHTML, undefined, {
+          ignoreClasses: true,
+          ignoreInlineStyles: true,
+        });
+        await selectAll(page);
+        await pressBackspace(page);
+      });
+    },
+  );
+});
--- a/packages/lexical-react/src/LexicalHashtagPlugin.ts
+++ b/packages/lexical-react/src/LexicalHashtagPlugin.ts
@ -201,9 +201,10 @@ function getHashtagRegexStringChars(): Readonly<{
    '\u1040-\u1049\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9' +
    '\uFF10-\uFF19';

-  // An alpha char is a unicode chars including unicode marks or
-  // letter or char in otherChars range
-  const alpha = unicodeLetters + unicodeAccents + otherChars;
+  // An alpha char is a unicode chars excluding unicode combining marks
+  // but including other chars, a hashtag must start with one of these,
+  // it does not make sense to have a combining mark before a base character.
+  const alpha = unicodeLetters + otherChars;

  // A numeric character is any with the number digit property, or
  // underscore. These characters can be included in hashtags, but a hashtag
@ -212,7 +213,7 @@ function getHashtagRegexStringChars(): Readonly<{

  // Alphanumeric char is any alpha char or a unicode char with decimal
  // number property \p{Nd}
-  const alphanumeric = alpha + numeric;
+  const alphanumeric = alpha + unicodeAccents + numeric;
  const hashChars = '#\\uFF03'; // normal '#' or full-width '#'

  return {
--- a/packages/lexical-selection/src/tests/unit/LexicalSelection.test.tsx
+++ b/packages/lexical-selection/src/tests/unit/LexicalSelection.test.tsx
@ -239,39 +239,6 @@ describe('LexicalSelection tests', () => {
    expect(actualSelection.focusOffset).toBe(expectedSelection.focusOffset);
  }

-  // eslint-disable-next-line @typescript-eslint/no-unused-vars
-  const GRAPHEME_SCENARIOS = [
-    {
-      description: 'grapheme cluster',
-      // Hangul grapheme cluster.
-      // https://www.compart.com/en/unicode/U+AC01
-      grapheme: '\u1100\u1161\u11A8',
-    },
-    {
-      description: 'extended grapheme cluster',
-      // Tamil 'ni' grapheme cluster.
-      // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
-      grapheme: '\u0BA8\u0BBF',
-    },
-    {
-      description: 'tailored grapheme cluster',
-      // Devangari 'kshi' tailored grapheme cluster.
-      // http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
-      grapheme: '\u0915\u094D\u0937\u093F',
-    },
-    {
-      description: 'Emoji sequence combined using zero-width joiners',
-      // https://emojipedia.org/family-woman-woman-girl-boy/
-      grapheme:
-        '\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66',
-    },
-    {
-      description: 'Emoji sequence with skin-tone modifier',
-      // https://emojipedia.org/clapping-hands-medium-skin-tone/
-      grapheme: '\uD83D\uDC4F\uD83C\uDFFD',
-    },
-  ];
-
  const suite = [
    {
      expectedHTML:
@ -735,64 +702,6 @@ describe('LexicalSelection tests', () => {
    },

    // Tests need fixing:
-    // ...GRAPHEME_SCENARIOS.flatMap(({description, grapheme}) => [
-    //   {
-    //     name: `Delete backward eliminates entire ${description} (${grapheme})`,
-    //     inputs: [insertText(grapheme + grapheme), deleteBackward(1)],
-    //     expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}</span></p></div>`,
-    //     expectedSelection: {
-    //       anchorPath: [0, 0, 0],
-    //       anchorOffset: grapheme.length,
-    //       focusPath: [0, 0, 0],
-    //       focusOffset: grapheme.length,
-    //     },
-    //     setup: emptySetup,
-    //   },
-    //   {
-    //     name: `Delete forward eliminates entire ${description} (${grapheme})`,
-    //     inputs: [
-    //       insertText(grapheme + grapheme),
-    //       moveNativeSelection([0, 0, 0], 0, [0, 0, 0], 0),
-    //       deleteForward(),
-    //     ],
-    //     expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}</span></p></div>`,
-    //     expectedSelection: {
-    //       anchorPath: [0, 0, 0],
-    //       anchorOffset: 0,
-    //       focusPath: [0, 0, 0],
-    //       focusOffset: 0,
-    //     },
-    //     setup: emptySetup,
-    //   },
-    //   {
-    //     name: `Move backward skips over grapheme cluster (${grapheme})`,
-    //     inputs: [insertText(grapheme + grapheme), moveBackward(1)],
-    //     expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}${grapheme}</span></p></div>`,
-    //     expectedSelection: {
-    //       anchorPath: [0, 0, 0],
-    //       anchorOffset: grapheme.length,
-    //       focusPath: [0, 0, 0],
-    //       focusOffset: grapheme.length,
-    //     },
-    //     setup: emptySetup,
-    //   },
-    //   {
-    //     name: `Move forward skips over grapheme cluster (${grapheme})`,
-    //     inputs: [
-    //       insertText(grapheme + grapheme),
-    //       moveNativeSelection([0, 0, 0], 0, [0, 0, 0], 0),
-    //       moveForward(),
-    //     ],
-    //     expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}${grapheme}</span></p></div>`,
-    //     expectedSelection: {
-    //       anchorPath: [0, 0, 0],
-    //       anchorOffset: grapheme.length,
-    //       focusPath: [0, 0, 0],
-    //       focusOffset: grapheme.length,
-    //     },
-    //     setup: emptySetup,
-    //   },
-    // ]),
    // {
    //   name: 'Jump to beginning and insert',
    //   inputs: [
--- a/packages/lexical/src/LexicalEvents.ts
+++ b/packages/lexical/src/LexicalEvents.ts
@ -89,7 +89,7 @@ import {
  $updateSelectedTextFromDOM,
  $updateTextNodeFromDOMContent,
  dispatchCommand,
-  doesContainGrapheme,
+  doesContainSurrogatePair,
  getAnchorTextFromDOM,
  getDOMSelection,
  getDOMSelectionFromTarget,
@ -224,7 +224,9 @@ function $shouldPreventDefaultAndInsertText(
        // a dangling `input` event caused by execCommand('insertText').
        lastBeforeInputInsertTextTimeStamp < timeStamp + 50)) ||
      (anchorNode.isDirty() && textLength < 2) ||
-      doesContainGrapheme(text)) &&
+      // TODO consider if there are other scenarios when multiple code units
+      //      should be addressed here
+      doesContainSurrogatePair(text)) &&
      anchor.offset !== focus.offset &&
      !anchorNode.isComposing()) ||
    // Any non standard text node.
--- a/packages/lexical/src/LexicalSelection.ts
+++ b/packages/lexical/src/LexicalSelection.ts
@ -58,7 +58,7 @@ import {
  $hasAncestor,
  $isTokenOrSegmented,
  $setCompositionKey,
-  doesContainGrapheme,
+  doesContainSurrogatePair,
  getDOMSelection,
  getDOMTextNode,
  getElementByKeyOrThrow,
@ -2014,6 +2014,70 @@ function moveNativeSelection(
  domSelection.modify(alter, direction, granularity);
 }

+/**
+ * Called by `RangeSelection.deleteCharacter` to determine if
+ * `this.modify('extend', isBackward, 'character')` extended the selection
+ * further than a user would expect for that operation.
+ *
+ * A short(?) JavaScript string vs. Unicode primer:
+ *
+ * Strings in JavaScript use an UTF-16 encoding, and the offsets into a
+ * string are based on those UTF-16 *code units*. This is basically a
+ * historical mistake (though logical at that time, decades ago), but
+ * can never really be fixed for compatibility reasons.
+ *
+ * In Unicode, a *code point* is the combination of one or more *code units*.
+ * and the range of a *code point* can fit into 21 bits.
+ *
+ * Every valid *code point* can be represented with one or two
+ * *UTF-16 code units*. One unit is used when the code point is in the
+ * Basic Multilingual Plane (BMP) and is `< 0xFFFF`. Anything outside
+ * of that plane is encoded with a *surrogate pair* of *code units* and
+ * `/[\uD800-\uDBFF][\uDC00-\uDFFF]/` is a regex that you could use to
+ * find any valid *surrogate pair*. As far as Unicode is concerned, these
+ * pairs represent a single *code point*, but in JavaScript, these pairs
+ * have a length of 2 (`pair.charCodeAt(n)` is really returning a
+ * UTF-16 *code unit*, not a unicode *code point*). It is possible to request
+ * a *code point* with `pair.codePointAt(0)` and enumerate code points
+ * in a string with `[...string]` but the offsets we work with, and
+ * the string length, are based in *code units* so that functionality
+ * is unfortunately not very useful here.
+ *
+ * This only gets us as far as *code points*. We now know that we must
+ * consider that each *code point* can have a length of 1 or 2 in JavaScript
+ * string distance. It gets even trickier because the visual representation
+ * of a character is a *grapheme* (approximately what the user thinks of
+ * as a character). A *grapheme* is one or more *code points*, and can
+ * essentially be arbitrarily long, as there are many ways to combine
+ * them.
+ *
+ * The `this.modify(…)` call has already extended our selection by one
+ * *grapheme* in the direction we want to delete. Sounds great, it's done
+ * a lot of awfully tricky work for us because this functionality has only
+ * recently become available in JavaScript via `Intl.Segmenter`. The
+ * problem is that in many cases the expected behavior of backspace or
+ * delete is *not always to delete a whole grapheme*. In some languages
+ * it's always expected that backspace ought to delete one code point, not the
+ * whole grapheme. In other situations such as emoji that use variation
+ * selectors you *do* want to delete the whole *grapheme*.
+ *
+ * In a few situations the behavior is even application dependent, such as
+ * with latin languages where you have multiple ways to represent the same
+ * character visually (e.g. a letter with an accent in one code point, or a
+ * letter followed by a combining mark in a second code point); some apps will
+ * delete the whole grapheme and others will delete only the combining mark,
+ * probably based on whether they perform some sort of *normalization* on their
+ * input to ensure that only one form is used when two sequences of code points
+ * can represent the same visual character. Lexical currently chooses not
+ * to perform any normalization so this type of combining marks will be
+ * deleted as a *code point* without deleting the whole *grapheme*.
+ *
+ * See also:
+ * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-2/#G25564
+ * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
+ * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G49537
+ * https://mathiasbynens.be/notes/javascript-unicode
+ */
 function $updateCaretSelectionForUnicodeCharacter(
  selection: RangeSelection,
  isBackward: boolean,
@ -2038,7 +2102,7 @@ function $updateCaretSelectionForUnicodeCharacter(

    if (startOffset !== characterOffset) {
      const text = anchorNode.getTextContent().slice(startOffset, endOffset);
-      if (!doesContainGrapheme(text)) {
+      if (shouldDeleteExactlyOneCodeUnit(text)) {
        if (isBackward) {
          focus.set(focus.key, characterOffset, focus.type);
        } else {
@ -2046,11 +2110,57 @@ function $updateCaretSelectionForUnicodeCharacter(
        }
      }
    }
-  } else {
-    // TODO Handling of multibyte characters
  }
 }

+function shouldDeleteExactlyOneCodeUnit(text: string) {
+  if (__DEV__) {
+    invariant(
+      text.length > 1,
+      'shouldDeleteExactlyOneCodeUnit: expecting to be called only with sequences of two or more code units',
+    );
+  }
+  return !(doesContainSurrogatePair(text) || doesContainEmoji(text));
+}
+
+/**
+ * Given the wall of text in $updateCaretSelectionForUnicodeCharacter, you'd
+ * think that the solution might be complex, but the only currently known
+ * cases given the above constraints where we want to delete a whole grapheme
+ * are when emoji is involved. Since ES6 we can use unicode character classes
+ * in regexp which makes this simple.
+ *
+ * It may make sense to add to this heuristic in the future if other
+ * edge cases are discovered, which is why detailed notes remain.
+ *
+ * This is implemented with runtime feature detection and will always
+ * return false on pre-2020 platforms that do not have unicode character
+ * class support.
+ */
+const doesContainEmoji: (text: string) => boolean = (() => {
+  try {
+    const re = new RegExp('\\p{Emoji}', 'u');
+    const test = re.test.bind(re);
+    // Sanity check a few emoji to make sure the regexp was parsed
+    // and works correctly. Any one of these should be sufficient,
+    // but they're cheap and it only runs once.
+    if (
+      // Emoji in the BMP (heart) with variation selector
+      test('\u2764\ufe0f') &&
+      // Emoji in the BMP (#) with variation selector
+      test('#\ufe0f\u20e3') &&
+      // Emoji outside the BMP (thumbs up) that is encoded with a surrogate pair
+      test('\ud83d\udc4d')
+    ) {
+      return test;
+    }
+  } catch (e) {
+    // SyntaxError
+  }
+  // fallback, surrogate pair already checked
+  return () => false;
+})();
+
 function $removeSegment(
  node: TextNode,
  isBackward: boolean,
--- a/packages/lexical/src/LexicalUtils.ts
+++ b/packages/lexical/src/LexicalUtils.ts
@ -629,7 +629,13 @@ function getNodeKeyFromDOMTree(
  return null;
 }

-export function doesContainGrapheme(str: string): boolean {
+/**
+ * Return true if `str` contains any valid surrogate pair.
+ *
+ * See also $updateCaretSelectionForUnicodeCharacter for
+ * a discussion on when and why this is useful.
+ */
+export function doesContainSurrogatePair(str: string): boolean {
  return /[\uD800-\uDBFF][\uDC00-\uDFFF]/g.test(str);
 }