Allow non-ASCII characters in Rust identifiers

Rust 1.53 (quite a while ago now) ungated the support for non-ASCII
identifiers.  This didn't work in gdb.  This is PR rust/20166.

This patch fixes the problem by allowing non-ASCII characters to be
considered as identifier components.  It seemed simplest to just pass
them through -- doing any extra checking didn't seem worthwhile.

The new test also verifies that such characters are allowed in strings
and character literals as well.  The latter also required a bit of
work in the lexer.

Bug: https://sourceware.org/bugzilla/show_bug.cgi?id=20166
This commit is contained in:
Tom Tromey
2022-01-26 15:39:03 -07:00
parent c1f5e54825
commit a723766c0e
3 changed files with 129 additions and 18 deletions

View File

@ -33,6 +33,12 @@
using namespace expr;
#if WORDS_BIGENDIAN
#define UTF32 "UTF-32BE"
#else
#define UTF32 "UTF-32LE"
#endif
/* A regular expression for matching Rust numbers. This is split up
since it is very long and this gives us a way to comment the
sections. */
@ -577,6 +583,35 @@ rust_parser::lex_escape (int is_byte)
return result;
}
/* A helper for lex_character.  TEXT points just past the opening
   single quote of a character literal and must not itself be a
   single quote.  Scan forward to the closing single quote, store the
   number of bytes before it in *LEN, and return those bytes converted
   from the host charset to UTF-32.

   If the input ends before a closing quote is found, *LEN is the
   distance to the terminating NUL and 0 is returned; the caller is
   expected to report the error.  If the converted text occupies more
   than 4 bytes (i.e. more than one UTF-32 code unit), error is called
   with "overlong character literal".  */
static uint32_t
lex_multibyte_char (const char *text, int *len)
{
  gdb_assert (text[0] != '\'');

  /* Look for the closing quote.  The scan is bounded only by the end
     of the input; a literal holding more than one character is
     rejected below, after conversion.  */
  int quote;
  for (quote = 1; text[quote] != '\0' && text[quote] != '\''; ++quote)
    ;
  *len = quote;

  /* The caller will issue an error.  */
  if (text[quote] == '\0')
    return 0;

  auto_obstack result;
  convert_between_encodings (host_charset (), UTF32, (const gdb_byte *) text,
			     quote, 1, &result, translit_none);

  int size = obstack_object_size (&result);
  if (size > 4)
    error (_("overlong character literal"));

  /* Start from zero so that no indeterminate bytes are returned
     should the conversion yield fewer than sizeof (value) bytes.  */
  uint32_t value = 0;
  memcpy (&value, obstack_finish (&result), size);
  return value;
}
/* Lex a character constant. */
int
@ -592,13 +627,15 @@ rust_parser::lex_character ()
}
gdb_assert (pstate->lexptr[0] == '\'');
++pstate->lexptr;
/* This should handle UTF-8 here. */
if (pstate->lexptr[0] == '\\')
if (pstate->lexptr[0] == '\'')
error (_("empty character literal"));
else if (pstate->lexptr[0] == '\\')
value = lex_escape (is_byte);
else
{
value = pstate->lexptr[0] & 0xff;
++pstate->lexptr;
int len;
value = lex_multibyte_char (&pstate->lexptr[0], &len);
pstate->lexptr += len;
}
if (pstate->lexptr[0] != '\'')
@ -695,16 +732,9 @@ rust_parser::lex_string ()
if (is_byte)
obstack_1grow (&obstack, value);
else
{
#if WORDS_BIGENDIAN
#define UTF32 "UTF-32BE"
#else
#define UTF32 "UTF-32LE"
#endif
convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value,
sizeof (value), sizeof (value),
&obstack, translit_none);
}
convert_between_encodings (UTF32, "UTF-8", (gdb_byte *) &value,
sizeof (value), sizeof (value),
&obstack, translit_none);
}
else if (pstate->lexptr[0] == '\0')
error (_("Unexpected EOF in string"));
@ -746,7 +776,10 @@ rust_identifier_start_p (char c)
return ((c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| c == '_'
|| c == '$');
|| c == '$'
/* Allow any non-ASCII character as an identifier. There
doesn't seem to be a need to be picky about this. */
|| (c & 0x80) != 0);
}
/* Lex an identifier. */
@ -772,13 +805,14 @@ rust_parser::lex_identifier ()
++pstate->lexptr;
/* For the time being this doesn't handle Unicode rules. Non-ASCII
identifiers are gated anyway. */
/* Allow any non-ASCII character here. This "handles" UTF-8 by
passing it through. */
while ((pstate->lexptr[0] >= 'a' && pstate->lexptr[0] <= 'z')
|| (pstate->lexptr[0] >= 'A' && pstate->lexptr[0] <= 'Z')
|| pstate->lexptr[0] == '_'
|| (is_gdb_var && pstate->lexptr[0] == '$')
|| (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9'))
|| (pstate->lexptr[0] >= '0' && pstate->lexptr[0] <= '9')
|| (pstate->lexptr[0] & 0x80) != 0)
++pstate->lexptr;

View File

@ -0,0 +1,51 @@
# Copyright (C) 2022 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Test non-ASCII (Unicode) identifiers, strings, and character
# literals in Rust expressions.
load_lib rust-support.exp
# Bail out early when Rust tests are to be skipped.
if {[skip_rust_tests]} {
continue
}
# Non-ASCII identifiers were allowed starting in 1.53.
# Split the compiler version on "." so the major and minor components
# can be compared separately.
set v [split [rust_compiler_version] .]
if {[lindex $v 0] == 1 && [lindex $v 1] < 53} {
untested "this test requires rust 1.53 or greater"
return -1
}
# Enable basic use of UTF-8. LC_ALL gets reset for each testfile.
setenv LC_ALL C.UTF-8
standard_testfile .rs
if {[prepare_for_testing "failed to prepare" $testfile $srcfile {debug rust}]} {
return -1
}
# Find the source line carrying the marker comment and run to it, so
# that the locals declared in the test program can be printed.
set line [gdb_get_line_number "set breakpoint here"]
if {![runto ${srcfile}:$line]} {
untested "could not run to breakpoint"
return -1
}
# A non-ASCII identifier, and the same character inside a string
# literal.  The last argument of each gdb_test is an ASCII-only test
# name.
gdb_test "print 𝕯" " = 98" "print D"
gdb_test "print \"𝕯\"" " = \"𝕯\"" "print D in string"
# This output is maybe not ideal, but it also isn't incorrect.
# 120175 is the decimal value of the code point U+1D56F.
gdb_test "print '𝕯'" " = 120175 '\\\\u\\\{01d56f\\\}'" \
"print D as char"
# An identifier mixing ASCII and non-ASCII characters.
gdb_test "print cç" " = 97" "print cc"
# A character literal holding more than one character must be
# rejected with an error.
gdb_test "print 'çc'" "overlong character literal" "print cc as char"

View File

@ -0,0 +1,26 @@
// Copyright (C) 2022 Free Software Foundation, Inc.
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#![allow(dead_code)]
#![allow(unused_variables)]
#![allow(unused_assignments)]
#![allow(uncommon_codepoints)]
#![allow(non_snake_case)]
/// Test program for printing non-ASCII identifiers from the debugger.
///
/// Declares two locals whose names contain non-ASCII characters and
/// then uses both, so that they are live at the marked line.
fn main() {
    let 𝕯 = 98;
    // The accompanying test script evaluates `print cç` and expects 97.
    let cç = 97;

    // The trailing marker comment on the next line is how the test
    // script locates the breakpoint; it must stay on this statement.
    println!("{}, {}", 𝕯, cç); // set breakpoint here
}