mirror of
https://github.com/facebook/lexical.git
synced 2025-05-20 00:26:38 +08:00
[lexical] Bug Fix: Workaround for delete character with emoji grapheme customers that do not include non-BMP code points (#7175)
This commit is contained in:
@ -0,0 +1,246 @@
|
|||||||
|
/**
|
||||||
|
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
*
|
||||||
|
* This source code is licensed under the MIT license found in the
|
||||||
|
* LICENSE file in the root directory of this source tree.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
moveLeft,
|
||||||
|
moveRight,
|
||||||
|
pressBackspace,
|
||||||
|
selectAll,
|
||||||
|
} from '../keyboardShortcuts/index.mjs';
|
||||||
|
import {
|
||||||
|
assertHTML,
|
||||||
|
assertSelection,
|
||||||
|
focusEditor,
|
||||||
|
html,
|
||||||
|
initialize,
|
||||||
|
test,
|
||||||
|
} from '../utils/index.mjs';
|
||||||
|
|
||||||
|
// Regression tests for #7163
|
||||||
|
test.describe.configure({mode: 'parallel'});
|
||||||
|
test.describe('Grapheme deleteCharacter', () => {
|
||||||
|
test.beforeEach(({isPlainText, isCollab, page}) =>
|
||||||
|
initialize({isCollab, isPlainText, page}),
|
||||||
|
);
|
||||||
|
[
|
||||||
|
// Original scenarios
|
||||||
|
{
|
||||||
|
backspaceCount: 3,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'grapheme cluster',
|
||||||
|
// Hangul grapheme cluster.
|
||||||
|
// https://www.compart.com/en/unicode/U+AC01
|
||||||
|
grapheme: '\u1100\u1161\u11A8',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 2,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'extended grapheme cluster',
|
||||||
|
// Tamil 'ni' grapheme cluster.
|
||||||
|
// http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
|
||||||
|
grapheme: '\u0BA8\u0BBF',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 4,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'tailored grapheme cluster',
|
||||||
|
|
||||||
|
// Unclear why Firefox behaves differently here
|
||||||
|
firefoxCaretDistance: 2,
|
||||||
|
|
||||||
|
// Devangari 'kshi' tailored grapheme cluster.
|
||||||
|
// http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
|
||||||
|
grapheme: '\u0915\u094D\u0937\u093F',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Emoji sequence combined using zero-width joiners',
|
||||||
|
// https://emojipedia.org/family-woman-woman-girl-boy/
|
||||||
|
grapheme:
|
||||||
|
'\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Emoji sequence with skin-tone modifier',
|
||||||
|
// https://emojipedia.org/clapping-hands-medium-skin-tone/
|
||||||
|
grapheme: '\uD83D\uDC4F\uD83C\uDFFD',
|
||||||
|
},
|
||||||
|
// New ones
|
||||||
|
{
|
||||||
|
backspaceCount: 2,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Arabic text with accent',
|
||||||
|
dir: 'rtl',
|
||||||
|
grapheme: '\u0647\u064e',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Editors that do normalization would do this in 1 backspace
|
||||||
|
backspaceCount: 2,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Latin with decomposed combining character',
|
||||||
|
grapheme: 'n\u0303',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description:
|
||||||
|
'Multiple codepoint Emoji with variation selector entirely in the BMP',
|
||||||
|
grapheme: '\u2764\ufe0f',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Multiple codepoint keycap Emoji',
|
||||||
|
grapheme: '#\ufe0f\u20e3',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 8,
|
||||||
|
caretDistance: 4,
|
||||||
|
description: 'Hindi',
|
||||||
|
// Unclear why this differs
|
||||||
|
firefoxCaretDistance: 5,
|
||||||
|
|
||||||
|
grapheme: '\u0905\u0928\u0941\u091a\u094d\u091b\u0947\u0926',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 4,
|
||||||
|
caretDistance: 2,
|
||||||
|
description: 'Korean',
|
||||||
|
grapheme: '\u1103\u1167\u1109\u1170',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 5,
|
||||||
|
caretDistance: 5,
|
||||||
|
description: 'Emojis outside the BMP',
|
||||||
|
grapheme: '\ud83c\udf37\ud83c\udf81\ud83d\udca9\ud83d\ude1c\ud83d\udc4d',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description:
|
||||||
|
'ZWJ emoji cluster with variation selection that looks like 4 characters but treated as one',
|
||||||
|
grapheme:
|
||||||
|
'\ud83d\udc69\ud83c\udffd\u200d\ud83d\udc68\ud83c\udffd\u200d\ud83d\udc76\ud83c\udffd\u200d\ud83d\udc66\ud83c\udffd',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Flag emoji with ZWJ and variation selector',
|
||||||
|
grapheme: '\ud83c\udff3\ufe0f\u200d\ud83c\udf08',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
backspaceCount: 1,
|
||||||
|
caretDistance: 1,
|
||||||
|
description: 'Chinese for seaborgium (surrogate pair)',
|
||||||
|
grapheme: '\ud862\udf4e',
|
||||||
|
},
|
||||||
|
].forEach(
|
||||||
|
({
|
||||||
|
backspaceCount,
|
||||||
|
caretDistance,
|
||||||
|
description,
|
||||||
|
grapheme,
|
||||||
|
dir = 'ltr',
|
||||||
|
skip = false,
|
||||||
|
firefoxCaretDistance = undefined,
|
||||||
|
}) => {
|
||||||
|
test(description, async ({page, browserName, isCollab, isPlainText}) => {
|
||||||
|
// We are only concerned about input here, not collab.
|
||||||
|
test.skip(isCollab || skip);
|
||||||
|
|
||||||
|
// You can render a grapheme with escape sequences like this:
|
||||||
|
// const fmt = (s) => "'" + Array.from({ length: s.length }, (_, i) => `\\u${s.charCodeAt(i).toString(16).padStart(4, '0')}`).join('') + "'";
|
||||||
|
// e.g. from dev tools copy(fmt('emoji here')) and then paste it in your text editor
|
||||||
|
await focusEditor(page);
|
||||||
|
const codeUnits = grapheme.length;
|
||||||
|
await page.keyboard.type(description);
|
||||||
|
await page.keyboard.press('Enter');
|
||||||
|
const expectedInitialHTML = isPlainText
|
||||||
|
? html`
|
||||||
|
<p dir="ltr">
|
||||||
|
<span data-lexical-text="true">${description}</span>
|
||||||
|
<br />
|
||||||
|
<br />
|
||||||
|
</p>
|
||||||
|
`
|
||||||
|
: html`
|
||||||
|
<p dir="ltr">
|
||||||
|
<span data-lexical-text="true">${description}</span>
|
||||||
|
</p>
|
||||||
|
<p><br /></p>
|
||||||
|
`;
|
||||||
|
const expectedGraphemeHTML = isPlainText
|
||||||
|
? html`
|
||||||
|
<p dir="ltr">
|
||||||
|
<span data-lexical-text="true">${description}</span>
|
||||||
|
<br />
|
||||||
|
<span data-lexical-text="true">${grapheme}</span>
|
||||||
|
</p>
|
||||||
|
`
|
||||||
|
: html`
|
||||||
|
<p dir="ltr">
|
||||||
|
<span data-lexical-text="true">${description}</span>
|
||||||
|
</p>
|
||||||
|
<p dir="${dir}">
|
||||||
|
<span data-lexical-text="true">${grapheme}</span>
|
||||||
|
</p>
|
||||||
|
`;
|
||||||
|
await assertHTML(page, expectedInitialHTML, undefined, {
|
||||||
|
ignoreClasses: true,
|
||||||
|
ignoreInlineStyles: true,
|
||||||
|
});
|
||||||
|
await page.keyboard.insertText(grapheme);
|
||||||
|
await assertHTML(page, expectedGraphemeHTML, undefined, {
|
||||||
|
ignoreClasses: true,
|
||||||
|
ignoreInlineStyles: true,
|
||||||
|
});
|
||||||
|
const selectionFromOffset = (offset, path) => ({
|
||||||
|
anchorOffset: offset,
|
||||||
|
anchorPath: path,
|
||||||
|
focusOffset: offset,
|
||||||
|
focusPath: path,
|
||||||
|
});
|
||||||
|
const graphemePath = isPlainText ? [0, 2, 0] : [1, 0, 0];
|
||||||
|
const initialPath = isPlainText ? [0] : [1];
|
||||||
|
const initialOffset = isPlainText ? 2 : 0;
|
||||||
|
await assertSelection(
|
||||||
|
page,
|
||||||
|
selectionFromOffset(codeUnits, graphemePath),
|
||||||
|
);
|
||||||
|
if (dir !== 'rtl') {
|
||||||
|
// It's unclear why Firefox navigates differently in these specific,
|
||||||
|
// cases, but that's not pertinent to what we're testing
|
||||||
|
const testedCaretDistance =
|
||||||
|
browserName === 'firefox' && firefoxCaretDistance !== undefined
|
||||||
|
? firefoxCaretDistance
|
||||||
|
: caretDistance;
|
||||||
|
await moveLeft(page, testedCaretDistance);
|
||||||
|
await assertSelection(page, selectionFromOffset(0, graphemePath));
|
||||||
|
await moveRight(page, testedCaretDistance);
|
||||||
|
}
|
||||||
|
await assertSelection(
|
||||||
|
page,
|
||||||
|
selectionFromOffset(codeUnits, graphemePath),
|
||||||
|
);
|
||||||
|
await pressBackspace(page, backspaceCount);
|
||||||
|
await assertSelection(
|
||||||
|
page,
|
||||||
|
selectionFromOffset(initialOffset, initialPath),
|
||||||
|
);
|
||||||
|
await assertHTML(page, expectedInitialHTML, undefined, {
|
||||||
|
ignoreClasses: true,
|
||||||
|
ignoreInlineStyles: true,
|
||||||
|
});
|
||||||
|
await selectAll(page);
|
||||||
|
await pressBackspace(page);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
});
|
@ -201,9 +201,10 @@ function getHashtagRegexStringChars(): Readonly<{
|
|||||||
'\u1040-\u1049\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9' +
|
'\u1040-\u1049\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9' +
|
||||||
'\uFF10-\uFF19';
|
'\uFF10-\uFF19';
|
||||||
|
|
||||||
// An alpha char is a unicode chars including unicode marks or
|
// An alpha char is a unicode chars excluding unicode combining marks
|
||||||
// letter or char in otherChars range
|
// but including other chars, a hashtag must start with one of these,
|
||||||
const alpha = unicodeLetters + unicodeAccents + otherChars;
|
// it does not make sense to have a combining mark before a base character.
|
||||||
|
const alpha = unicodeLetters + otherChars;
|
||||||
|
|
||||||
// A numeric character is any with the number digit property, or
|
// A numeric character is any with the number digit property, or
|
||||||
// underscore. These characters can be included in hashtags, but a hashtag
|
// underscore. These characters can be included in hashtags, but a hashtag
|
||||||
@ -212,7 +213,7 @@ function getHashtagRegexStringChars(): Readonly<{
|
|||||||
|
|
||||||
// Alphanumeric char is any alpha char or a unicode char with decimal
|
// Alphanumeric char is any alpha char or a unicode char with decimal
|
||||||
// number property \p{Nd}
|
// number property \p{Nd}
|
||||||
const alphanumeric = alpha + numeric;
|
const alphanumeric = alpha + unicodeAccents + numeric;
|
||||||
const hashChars = '#\\uFF03'; // normal '#' or full-width '#'
|
const hashChars = '#\\uFF03'; // normal '#' or full-width '#'
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -239,39 +239,6 @@ describe('LexicalSelection tests', () => {
|
|||||||
expect(actualSelection.focusOffset).toBe(expectedSelection.focusOffset);
|
expect(actualSelection.focusOffset).toBe(expectedSelection.focusOffset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
||||||
const GRAPHEME_SCENARIOS = [
|
|
||||||
{
|
|
||||||
description: 'grapheme cluster',
|
|
||||||
// Hangul grapheme cluster.
|
|
||||||
// https://www.compart.com/en/unicode/U+AC01
|
|
||||||
grapheme: '\u1100\u1161\u11A8',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
description: 'extended grapheme cluster',
|
|
||||||
// Tamil 'ni' grapheme cluster.
|
|
||||||
// http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
|
|
||||||
grapheme: '\u0BA8\u0BBF',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
description: 'tailored grapheme cluster',
|
|
||||||
// Devangari 'kshi' tailored grapheme cluster.
|
|
||||||
// http://unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
|
|
||||||
grapheme: '\u0915\u094D\u0937\u093F',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
description: 'Emoji sequence combined using zero-width joiners',
|
|
||||||
// https://emojipedia.org/family-woman-woman-girl-boy/
|
|
||||||
grapheme:
|
|
||||||
'\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
description: 'Emoji sequence with skin-tone modifier',
|
|
||||||
// https://emojipedia.org/clapping-hands-medium-skin-tone/
|
|
||||||
grapheme: '\uD83D\uDC4F\uD83C\uDFFD',
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const suite = [
|
const suite = [
|
||||||
{
|
{
|
||||||
expectedHTML:
|
expectedHTML:
|
||||||
@ -735,64 +702,6 @@ describe('LexicalSelection tests', () => {
|
|||||||
},
|
},
|
||||||
|
|
||||||
// Tests need fixing:
|
// Tests need fixing:
|
||||||
// ...GRAPHEME_SCENARIOS.flatMap(({description, grapheme}) => [
|
|
||||||
// {
|
|
||||||
// name: `Delete backward eliminates entire ${description} (${grapheme})`,
|
|
||||||
// inputs: [insertText(grapheme + grapheme), deleteBackward(1)],
|
|
||||||
// expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}</span></p></div>`,
|
|
||||||
// expectedSelection: {
|
|
||||||
// anchorPath: [0, 0, 0],
|
|
||||||
// anchorOffset: grapheme.length,
|
|
||||||
// focusPath: [0, 0, 0],
|
|
||||||
// focusOffset: grapheme.length,
|
|
||||||
// },
|
|
||||||
// setup: emptySetup,
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// name: `Delete forward eliminates entire ${description} (${grapheme})`,
|
|
||||||
// inputs: [
|
|
||||||
// insertText(grapheme + grapheme),
|
|
||||||
// moveNativeSelection([0, 0, 0], 0, [0, 0, 0], 0),
|
|
||||||
// deleteForward(),
|
|
||||||
// ],
|
|
||||||
// expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}</span></p></div>`,
|
|
||||||
// expectedSelection: {
|
|
||||||
// anchorPath: [0, 0, 0],
|
|
||||||
// anchorOffset: 0,
|
|
||||||
// focusPath: [0, 0, 0],
|
|
||||||
// focusOffset: 0,
|
|
||||||
// },
|
|
||||||
// setup: emptySetup,
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// name: `Move backward skips over grapheme cluster (${grapheme})`,
|
|
||||||
// inputs: [insertText(grapheme + grapheme), moveBackward(1)],
|
|
||||||
// expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}${grapheme}</span></p></div>`,
|
|
||||||
// expectedSelection: {
|
|
||||||
// anchorPath: [0, 0, 0],
|
|
||||||
// anchorOffset: grapheme.length,
|
|
||||||
// focusPath: [0, 0, 0],
|
|
||||||
// focusOffset: grapheme.length,
|
|
||||||
// },
|
|
||||||
// setup: emptySetup,
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// name: `Move forward skips over grapheme cluster (${grapheme})`,
|
|
||||||
// inputs: [
|
|
||||||
// insertText(grapheme + grapheme),
|
|
||||||
// moveNativeSelection([0, 0, 0], 0, [0, 0, 0], 0),
|
|
||||||
// moveForward(),
|
|
||||||
// ],
|
|
||||||
// expectedHTML: `<div contenteditable="true" style="user-select: text; white-space: pre-wrap; word-break: break-word;" data-lexical-editor="true"><p dir=\"ltr\"><span>${grapheme}${grapheme}</span></p></div>`,
|
|
||||||
// expectedSelection: {
|
|
||||||
// anchorPath: [0, 0, 0],
|
|
||||||
// anchorOffset: grapheme.length,
|
|
||||||
// focusPath: [0, 0, 0],
|
|
||||||
// focusOffset: grapheme.length,
|
|
||||||
// },
|
|
||||||
// setup: emptySetup,
|
|
||||||
// },
|
|
||||||
// ]),
|
|
||||||
// {
|
// {
|
||||||
// name: 'Jump to beginning and insert',
|
// name: 'Jump to beginning and insert',
|
||||||
// inputs: [
|
// inputs: [
|
||||||
|
@ -89,7 +89,7 @@ import {
|
|||||||
$updateSelectedTextFromDOM,
|
$updateSelectedTextFromDOM,
|
||||||
$updateTextNodeFromDOMContent,
|
$updateTextNodeFromDOMContent,
|
||||||
dispatchCommand,
|
dispatchCommand,
|
||||||
doesContainGrapheme,
|
doesContainSurrogatePair,
|
||||||
getAnchorTextFromDOM,
|
getAnchorTextFromDOM,
|
||||||
getDOMSelection,
|
getDOMSelection,
|
||||||
getDOMSelectionFromTarget,
|
getDOMSelectionFromTarget,
|
||||||
@ -224,7 +224,9 @@ function $shouldPreventDefaultAndInsertText(
|
|||||||
// a dangling `input` event caused by execCommand('insertText').
|
// a dangling `input` event caused by execCommand('insertText').
|
||||||
lastBeforeInputInsertTextTimeStamp < timeStamp + 50)) ||
|
lastBeforeInputInsertTextTimeStamp < timeStamp + 50)) ||
|
||||||
(anchorNode.isDirty() && textLength < 2) ||
|
(anchorNode.isDirty() && textLength < 2) ||
|
||||||
doesContainGrapheme(text)) &&
|
// TODO consider if there are other scenarios when multiple code units
|
||||||
|
// should be addressed here
|
||||||
|
doesContainSurrogatePair(text)) &&
|
||||||
anchor.offset !== focus.offset &&
|
anchor.offset !== focus.offset &&
|
||||||
!anchorNode.isComposing()) ||
|
!anchorNode.isComposing()) ||
|
||||||
// Any non standard text node.
|
// Any non standard text node.
|
||||||
|
@ -58,7 +58,7 @@ import {
|
|||||||
$hasAncestor,
|
$hasAncestor,
|
||||||
$isTokenOrSegmented,
|
$isTokenOrSegmented,
|
||||||
$setCompositionKey,
|
$setCompositionKey,
|
||||||
doesContainGrapheme,
|
doesContainSurrogatePair,
|
||||||
getDOMSelection,
|
getDOMSelection,
|
||||||
getDOMTextNode,
|
getDOMTextNode,
|
||||||
getElementByKeyOrThrow,
|
getElementByKeyOrThrow,
|
||||||
@ -2014,6 +2014,70 @@ function moveNativeSelection(
|
|||||||
domSelection.modify(alter, direction, granularity);
|
domSelection.modify(alter, direction, granularity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Called by `RangeSelection.deleteCharacter` to determine if
|
||||||
|
* `this.modify('extend', isBackward, 'character')` extended the selection
|
||||||
|
* further than a user would expect for that operation.
|
||||||
|
*
|
||||||
|
* A short(?) JavaScript string vs. Unicode primer:
|
||||||
|
*
|
||||||
|
* Strings in JavaScript use an UTF-16 encoding, and the offsets into a
|
||||||
|
* string are based on those UTF-16 *code units*. This is basically a
|
||||||
|
* historical mistake (though logical at that time, decades ago), but
|
||||||
|
* can never really be fixed for compatibility reasons.
|
||||||
|
*
|
||||||
|
* In Unicode, a *code point* is the combination of one or more *code units*.
|
||||||
|
* and the range of a *code point* can fit into 21 bits.
|
||||||
|
*
|
||||||
|
* Every valid *code point* can be represented with one or two
|
||||||
|
* *UTF-16 code units*. One unit is used when the code point is in the
|
||||||
|
* Basic Multilingual Plane (BMP) and is `< 0xFFFF`. Anything outside
|
||||||
|
* of that plane is encoded with a *surrogate pair* of *code units* and
|
||||||
|
* `/[\uD800-\uDBFF][\uDC00-\uDFFF]/` is a regex that you could use to
|
||||||
|
* find any valid *surrogate pair*. As far as Unicode is concerned, these
|
||||||
|
* pairs represent a single *code point*, but in JavaScript, these pairs
|
||||||
|
* have a length of 2 (`pair.charCodeAt(n)` is really returning a
|
||||||
|
* UTF-16 *code unit*, not a unicode *code point*). It is possible to request
|
||||||
|
* a *code point* with `pair.codePointAt(0)` and enumerate code points
|
||||||
|
* in a string with `[...string]` but the offsets we work with, and
|
||||||
|
* the string length, are based in *code units* so that functionality
|
||||||
|
* is unfortunately not very useful here.
|
||||||
|
*
|
||||||
|
* This only gets us as far as *code points*. We now know that we must
|
||||||
|
* consider that each *code point* can have a length of 1 or 2 in JavaScript
|
||||||
|
* string distance. It gets even trickier because the visual representation
|
||||||
|
* of a character is a *grapheme* (approximately what the user thinks of
|
||||||
|
* as a character). A *grapheme* is one or more *code points*, and can
|
||||||
|
* essentially be arbitrarily long, as there are many ways to combine
|
||||||
|
* them.
|
||||||
|
*
|
||||||
|
* The `this.modify(…)` call has already extended our selection by one
|
||||||
|
* *grapheme* in the direction we want to delete. Sounds great, it's done
|
||||||
|
* a lot of awfully tricky work for us because this functionality has only
|
||||||
|
* recently become available in JavaScript via `Intl.Segmenter`. The
|
||||||
|
* problem is that in many cases the expected behavior of backspace or
|
||||||
|
* delete is *not always to delete a whole grapheme*. In some languages
|
||||||
|
* it's always expected that backspace ought to delete one code point, not the
|
||||||
|
* whole grapheme. In other situations such as emoji that use variation
|
||||||
|
* selectors you *do* want to delete the whole *grapheme*.
|
||||||
|
*
|
||||||
|
* In a few situations the behavior is even application dependent, such as
|
||||||
|
* with latin languages where you have multiple ways to represent the same
|
||||||
|
* character visually (e.g. a letter with an accent in one code point, or a
|
||||||
|
* letter followed by a combining mark in a second code point); some apps will
|
||||||
|
* delete the whole grapheme and others will delete only the combining mark,
|
||||||
|
* probably based on whether they perform some sort of *normalization* on their
|
||||||
|
* input to ensure that only one form is used when two sequences of code points
|
||||||
|
* can represent the same visual character. Lexical currently chooses not
|
||||||
|
* to perform any normalization so this type of combining marks will be
|
||||||
|
* deleted as a *code point* without deleting the whole *grapheme*.
|
||||||
|
*
|
||||||
|
* See also:
|
||||||
|
* https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-2/#G25564
|
||||||
|
* https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
|
||||||
|
* https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G49537
|
||||||
|
* https://mathiasbynens.be/notes/javascript-unicode
|
||||||
|
*/
|
||||||
function $updateCaretSelectionForUnicodeCharacter(
|
function $updateCaretSelectionForUnicodeCharacter(
|
||||||
selection: RangeSelection,
|
selection: RangeSelection,
|
||||||
isBackward: boolean,
|
isBackward: boolean,
|
||||||
@ -2038,7 +2102,7 @@ function $updateCaretSelectionForUnicodeCharacter(
|
|||||||
|
|
||||||
if (startOffset !== characterOffset) {
|
if (startOffset !== characterOffset) {
|
||||||
const text = anchorNode.getTextContent().slice(startOffset, endOffset);
|
const text = anchorNode.getTextContent().slice(startOffset, endOffset);
|
||||||
if (!doesContainGrapheme(text)) {
|
if (shouldDeleteExactlyOneCodeUnit(text)) {
|
||||||
if (isBackward) {
|
if (isBackward) {
|
||||||
focus.set(focus.key, characterOffset, focus.type);
|
focus.set(focus.key, characterOffset, focus.type);
|
||||||
} else {
|
} else {
|
||||||
@ -2046,11 +2110,57 @@ function $updateCaretSelectionForUnicodeCharacter(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// TODO Handling of multibyte characters
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function shouldDeleteExactlyOneCodeUnit(text: string) {
|
||||||
|
if (__DEV__) {
|
||||||
|
invariant(
|
||||||
|
text.length > 1,
|
||||||
|
'shouldDeleteExactlyOneCodeUnit: expecting to be called only with sequences of two or more code units',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return !(doesContainSurrogatePair(text) || doesContainEmoji(text));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given the wall of text in $updateCaretSelectionForUnicodeCharacter, you'd
|
||||||
|
* think that the solution might be complex, but the only currently known
|
||||||
|
* cases given the above constraints where we want to delete a whole grapheme
|
||||||
|
* are when emoji is involved. Since ES6 we can use unicode character classes
|
||||||
|
* in regexp which makes this simple.
|
||||||
|
*
|
||||||
|
* It may make sense to add to this heuristic in the future if other
|
||||||
|
* edge cases are discovered, which is why detailed notes remain.
|
||||||
|
*
|
||||||
|
* This is implemented with runtime feature detection and will always
|
||||||
|
* return false on pre-2020 platforms that do not have unicode character
|
||||||
|
* class support.
|
||||||
|
*/
|
||||||
|
const doesContainEmoji: (text: string) => boolean = (() => {
|
||||||
|
try {
|
||||||
|
const re = new RegExp('\\p{Emoji}', 'u');
|
||||||
|
const test = re.test.bind(re);
|
||||||
|
// Sanity check a few emoji to make sure the regexp was parsed
|
||||||
|
// and works correctly. Any one of these should be sufficient,
|
||||||
|
// but they're cheap and it only runs once.
|
||||||
|
if (
|
||||||
|
// Emoji in the BMP (heart) with variation selector
|
||||||
|
test('\u2764\ufe0f') &&
|
||||||
|
// Emoji in the BMP (#) with variation selector
|
||||||
|
test('#\ufe0f\u20e3') &&
|
||||||
|
// Emoji outside the BMP (thumbs up) that is encoded with a surrogate pair
|
||||||
|
test('\ud83d\udc4d')
|
||||||
|
) {
|
||||||
|
return test;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// SyntaxError
|
||||||
|
}
|
||||||
|
// fallback, surrogate pair already checked
|
||||||
|
return () => false;
|
||||||
|
})();
|
||||||
|
|
||||||
function $removeSegment(
|
function $removeSegment(
|
||||||
node: TextNode,
|
node: TextNode,
|
||||||
isBackward: boolean,
|
isBackward: boolean,
|
||||||
|
@ -629,7 +629,13 @@ function getNodeKeyFromDOMTree(
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function doesContainGrapheme(str: string): boolean {
|
/**
|
||||||
|
* Return true if `str` contains any valid surrogate pair.
|
||||||
|
*
|
||||||
|
* See also $updateCaretSelectionForUnicodeCharacter for
|
||||||
|
* a discussion on when and why this is useful.
|
||||||
|
*/
|
||||||
|
export function doesContainSurrogatePair(str: string): boolean {
|
||||||
return /[\uD800-\uDBFF][\uDC00-\uDFFF]/g.test(str);
|
return /[\uD800-\uDBFF][\uDC00-\uDFFF]/g.test(str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user