Add multibyte character warning option to the assembler.

* as.c (parse_args): Add support for --multibyte-handling.
	* as.h (multibyte_handling): Declare.
	* app.c (scan_for_multibyte_characters): New function.
	(do_scrub_chars): Call the new function if multibyte warning is
	enabled.
	* input-scrub.c (input_scrub_next_buffer): Call the multibyte
	scanning function if multibyte warnings are enabled.
	* symbols.c (struct symbol_flags): Add multibyte_warned bit.
	(symbol_init): Call the multibyte scanning function if multibyte
	symbol warnings are enabled.
	(S_SET_SEGMENT): Likewise.
	* NEWS: Mention the new feature.
	* doc/as.texi: Document the new feature.
	* testsuite/gas/all/multibyte.s: New test source file.
	* testsuite/gas/all/multibyte1.d: New test driver file.
	* testsuite/gas/all/multibyte1.l: New test expected output.
	* testsuite/gas/all/multibyte2.d: New test driver file.
	* testsuite/gas/all/multibyte2.l: New test expected output.
	* testsuite/gas/all/gas.exp: Run the new tests.
This commit is contained in:
Nick Clifton
2021-11-18 16:48:19 +00:00
parent 76eb8ef1ce
commit 578c64a45a
14 changed files with 205 additions and 10 deletions

View File

@ -1,3 +1,25 @@
2021-11-18 Nick Clifton <nickc@redhat.com>
* as.c (parse_args): Add support for --multibyte-handling.
* as.h (multibyte_handling): Declare.
* app.c (scan_for_multibyte_characters): New function.
(do_scrub_chars): Call the new function if multibyte warning is
enabled.
* input-scrub.c (input_scrub_next_buffer): Call the multibyte
scanning function if multibyte warnings are enabled.
* symbols.c (struct symbol_flags): Add multibyte_warned bit.
(symbol_init): Call the multibyte scanning function if multibyte
symbol warnings are enabled.
(S_SET_SEGMENT): Likewise.
* NEWS: Mention the new feature.
* doc/as.texi: Document the new feature.
* testsuite/gas/all/multibyte.s: New test source file.
* testsuite/gas/all/multibyte1.d: New test driver file.
* testsuite/gas/all/multibyte1.l: New test expected output.
* testsuite/gas/all/multibyte2.d: New test driver file.
* testsuite/gas/all/multibyte2.l: New test expected output.
* testsuite/gas/all/gas.exp: Run the new tests.
2021-11-15 Eric Botcazou <ebotcazou@adacore.com>
* doc/as.texi (File): Update description of .file 0 directive.

View File

@ -13,6 +13,14 @@
* Add support for Scalable Matrix Extension (SME) for AArch64.
* The --multibyte-handling=[allow|warn|warn-sym-only] option tells the
assembler what to do when it encounters multibyte characters in the input. The
default is to allow them. Setting the option to "warn" will generate a
warning message whenever any multibyte character is encountered. Setting the
option to "warn-sym-only" will make the assembler generate a warning whenever a
symbol is defined containing multibyte characters. (References to undefined
symbols will not generate warnings).
* Outputs of .ds.x directive and .tfloat directive with hex input from
x86 assembler have been reduced from 12 bytes to 10 bytes to match the
output of .tfloat directive.

View File

@ -345,6 +345,55 @@ process_escape (int ch)
}
}
/* Maximum number of per-byte multibyte warnings issued over the whole
   run before further warnings are suppressed.  */
#define MULTIBYTE_WARN_COUNT_LIMIT 10

/* Number of per-byte multibyte warnings issued so far.  */
static unsigned int multibyte_warn_count = 0;

/* Scan the bytes in the range [START, END) for multibyte characters.
   Any byte with a value above 0x7f is treated as belonging to a
   multibyte character.

   If WARN is false, return true as soon as such a byte is found and
   false if there is none; no messages are generated.

   If WARN is true, issue a warning for each such byte, using the
   location reported by as_where() when one is available, and return
   true if at least one was found.  Once MULTIBYTE_WARN_COUNT_LIMIT
   warnings have been issued a final "suppressed" notice is emitted and
   every later call with WARN set returns false without scanning.  */

bool
scan_for_multibyte_characters (const unsigned char * start,
                               const unsigned char * end,
                               bool warn)
{
  if (end <= start)
    return false;

  /* The counter stops at exactly MULTIBYTE_WARN_COUNT_LIMIT when the
     suppression notice is issued, so the test must be >= here.  With >
     the next call would slip through and then warn without bound, as
     the == check below could never fire again.  */
  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
    return false;

  bool found = false;

  while (start < end)
    {
      unsigned char c;

      if ((c = * start++) <= 0x7f)
        continue;

      /* The caller only wants to know whether one exists.  */
      if (!warn)
        return true;

      found = true;

      const char * filename;
      unsigned int lineno;

      filename = as_where (& lineno);
      if (filename == NULL)
        as_warn (_("multibyte character (%#x) encountered in input"), c);
      else if (lineno == 0)
        as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
      else
        as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

      if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
        {
          as_warn (_("further multibyte character warnings suppressed"));
          break;
        }
    }

  return found;
}
/* This function is called to process input characters. The GET
parameter is used to retrieve more input characters. GET should
set its parameter to point to a buffer, and return the length of
@ -463,6 +512,11 @@ do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
return 0;
from = input_buffer;
fromend = from + fromlen;
if (multibyte_handling == multibyte_warn)
(void) scan_for_multibyte_characters ((const unsigned char *) from,
(const unsigned char* ) fromend,
true /* Generate warnings. */);
}
while (1)

View File

@ -474,7 +474,7 @@ parse_args (int * pargc, char *** pargv)
OPTION_DEBUG_PREFIX_MAP,
OPTION_DEFSYM,
OPTION_LISTING_LHS_WIDTH,
OPTION_LISTING_LHS_WIDTH2,
OPTION_LISTING_LHS_WIDTH2, /* = STD_BASE + 10 */
OPTION_LISTING_RHS_WIDTH,
OPTION_LISTING_CONT_LINES,
OPTION_DEPFILE,
@ -484,7 +484,7 @@ parse_args (int * pargc, char *** pargv)
OPTION_GDWARF_3,
OPTION_GDWARF_4,
OPTION_GDWARF_5,
OPTION_GDWARF_SECTIONS,
OPTION_GDWARF_SECTIONS, /* = STD_BASE + 20 */
OPTION_GDWARF_CIE_VERSION,
OPTION_STRIP_LOCAL_ABSOLUTE,
OPTION_TRADITIONAL_FORMAT,
@ -494,7 +494,7 @@ parse_args (int * pargc, char *** pargv)
OPTION_NOEXECSTACK,
OPTION_SIZE_CHECK,
OPTION_ELF_STT_COMMON,
OPTION_ELF_BUILD_NOTES,
OPTION_ELF_BUILD_NOTES, /* = STD_BASE + 30 */
OPTION_SECTNAME_SUBST,
OPTION_ALTERNATE,
OPTION_AL,
@ -503,7 +503,8 @@ parse_args (int * pargc, char *** pargv)
OPTION_WARN_FATAL,
OPTION_COMPRESS_DEBUG,
OPTION_NOCOMPRESS_DEBUG,
OPTION_NO_PAD_SECTIONS /* = STD_BASE + 40 */
OPTION_NO_PAD_SECTIONS,
OPTION_MULTIBYTE_HANDLING /* = STD_BASE + 40 */
/* When you add options here, check that they do
not collide with OPTION_MD_BASE. See as.h. */
};
@ -581,6 +582,7 @@ parse_args (int * pargc, char *** pargv)
,{"target-help", no_argument, NULL, OPTION_TARGET_HELP}
,{"traditional-format", no_argument, NULL, OPTION_TRADITIONAL_FORMAT}
,{"warn", no_argument, NULL, OPTION_WARN}
,{"multibyte-handling", required_argument, NULL, OPTION_MULTIBYTE_HANDLING}
};
/* Construct the option lists from the standard list and the target
@ -683,6 +685,19 @@ parse_args (int * pargc, char *** pargv)
flag_traditional_format = 1;
break;
case OPTION_MULTIBYTE_HANDLING:
  /* Map the argument of --multibyte-handling onto the handling mode.
     Both "warn-sym-only" and "warn_sym_only" are accepted as
     spellings of the symbols-only mode.  */
  if (strcmp (optarg, "allow") == 0)
    multibyte_handling = multibyte_allow;
  else if (strcmp (optarg, "warn") == 0)
    multibyte_handling = multibyte_warn;
  else if (strcmp (optarg, "warn-sym-only") == 0)
    multibyte_handling = multibyte_warn_syms;
  else if (strcmp (optarg, "warn_sym_only") == 0)
    multibyte_handling = multibyte_warn_syms;
  else
    /* Name the option as the user typed it: --multibyte-handling.  */
    as_fatal (_("unexpected argument to --multibyte-handling: '%s'"), optarg);
  break;
case OPTION_VERSION:
/* This output is intended to follow the GNU standards document. */
printf (_("GNU assembler %s\n"), BFD_VERSION_STRING);

View File

@ -344,6 +344,14 @@ COMMON int linkrelax;
COMMON int do_not_pad_sections_to_alignment;
/* How the assembler reacts to multibyte characters in its input, as
   selected by the --multibyte-handling command line option.  */
enum multibyte_input_handling
{
  /* Accept multibyte characters without complaint.  */
  multibyte_allow = 0,
  /* Warn about any multibyte character found in the input.  */
  multibyte_warn = 1,
  /* Warn only when a symbol whose name contains multibyte characters
     is defined.  */
  multibyte_warn_syms = 2
};
COMMON enum multibyte_input_handling multibyte_handling;
/* TRUE if we should produce a listing. */
extern int listing;
@ -450,6 +458,7 @@ void input_scrub_insert_file (char *);
char * input_scrub_new_file (const char *);
char * input_scrub_next_buffer (char **bufp);
size_t do_scrub_chars (size_t (*get) (char *, size_t), char *, size_t);
bool scan_for_multibyte_characters (const unsigned char *, const unsigned char *, bool);
int gen_to_words (LITTLENUM_TYPE *, int, long);
int had_err (void);
int ignore_input (void);

View File

@ -245,6 +245,7 @@ gcc(1), ld(1), and the Info entries for @file{binutils} and @file{ld}.
[@b{--sectname-subst}] [@b{--size-check=[error|warning]}]
[@b{--elf-stt-common=[no|yes]}]
[@b{--generate-missing-build-notes=[no|yes]}]
[@b{--multibyte-handling=[allow|warn|warn-sym-only]}]
[@b{--target-help}] [@var{target-options}]
[@b{--}|@var{files} @dots{}]
@c
@ -871,6 +872,18 @@ Set the maximum width of an input source line, as displayed in a listing, to
Set the maximum number of lines printed in a listing for a single line of input
to @var{number} + 1.
@item --multibyte-handling=allow
@itemx --multibyte-handling=warn
@itemx --multibyte-handling=warn-sym-only
Controls how the assembler handles multibyte characters in the input. The
default (which can be restored by using the @option{allow} argument) is to
allow such characters without complaint. Using the @option{warn} argument will
make the assembler generate a warning message whenever any multibyte character
is encountered. Using the @option{warn-sym-only} argument will only cause a
warning to be generated when a symbol is defined with a name that contains
multibyte characters. (References to undefined symbols will not generate a
warning).
@item --no-pad-sections
Stop the assembler from padding the ends of output sections to the alignment
of that section. The default is to pad the sections, but this can waste space
@ -2966,9 +2979,11 @@ are noted in @ref{Machine Dependencies}.
@end ifset
No symbol may begin with a digit. Case is significant.
There is no length limit; all characters are significant. Multibyte characters
are supported. Symbols are delimited by characters not in that set, or by the
beginning of a file (since the source program must end with a newline, the end
of a file is not a possible symbol delimiter). @xref{Symbols}.
are supported, but note that the setting of the
@option{--multibyte-handling} option might prevent their use. Symbols
are delimited by characters not in that set, or by the beginning of a file
(since the source program must end with a newline, the end of a file is not a
possible symbol delimiter). @xref{Symbols}.
Symbol names may also be enclosed in double quote @code{"} characters. In such
cases any characters are allowed, except for the NUL character. If a double
@ -3858,11 +3873,18 @@ than @code{Foo}.
Symbol names do not start with a digit. An exception to this rule is made for
Local Labels. See below.
Multibyte characters are supported. To generate a symbol name containing
Multibyte characters are supported, but note that the setting of the
@option{--multibyte-handling} option might prevent their use.
To generate a symbol name containing
multibyte characters enclose it within double quotes and use escape codes.
@xref{Strings}. Generating a multibyte symbol name from a label is not
currently supported.
Since multibyte symbol names are unusual, and could possibly be used
maliciously, @command{@value{AS}} provides a command line option
(@option{--multibyte-handling=warn-sym-only}) which can be used to generate a
warning message whenever a symbol name containing multibyte characters is defined.
Each symbol has exactly one name. Each name in an assembly language program
refers to exactly one symbol. You may use that symbol name any number of times
in a program.

View File

@ -377,6 +377,11 @@ input_scrub_next_buffer (char **bufp)
++p;
}
if (multibyte_handling == multibyte_warn)
(void) scan_for_multibyte_characters ((const unsigned char *) p,
(const unsigned char *) limit,
true /* Generate warnings */);
/* We found a newline in the newly read chars. */
partial_where = p;
partial_size = limit - p;

View File

@ -82,6 +82,10 @@ struct symbol_flags
/* Whether the symbol has been marked to be removed by a .symver
directive. */
unsigned int removed : 1;
/* Set when a warning about the symbol containing multibyte characters
is generated. */
unsigned int multibyte_warned : 1;
};
/* A pointer in the symbol may point to either a complete symbol
@ -198,7 +202,7 @@ static void *
symbol_entry_find (htab_t table, const char *name)
{
hashval_t hash = htab_hash_string (name);
symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
symbol_entry_t needle = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
hash, name, 0, 0, 0 } };
return htab_find_with_hash (table, &needle, hash);
}
@ -309,6 +313,18 @@ symbol_init (symbolS *symbolP, const char *name, asection *sec,
symbolP->bsym->name = name;
symbolP->bsym->section = sec;
if (multibyte_handling == multibyte_warn_syms
&& ! symbolP->flags.local_symbol
&& sec != undefined_section
&& ! symbolP->flags.multibyte_warned
&& scan_for_multibyte_characters ((const unsigned char *) name,
(const unsigned char *) name + strlen (name),
false /* Do not warn. */))
{
as_warn (_("symbol '%s' contains multibyte characters"), name);
symbolP->flags.multibyte_warned = 1;
}
S_SET_VALUE (symbolP, valu);
symbol_clear_list_pointers (symbolP);
@ -2427,7 +2443,21 @@ S_SET_SEGMENT (symbolS *s, segT seg)
abort ();
}
else
s->bsym->section = seg;
{
if (multibyte_handling == multibyte_warn_syms
&& ! s->flags.local_symbol
&& seg != undefined_section
&& ! s->flags.multibyte_warned
&& scan_for_multibyte_characters ((const unsigned char *) s->name,
(const unsigned char *) s->name + strlen (s->name),
false))
{
as_warn (_("symbol '%s' contains multibyte characters"), s->name);
s->flags.multibyte_warned = 1;
}
s->bsym->section = seg;
}
}
void

View File

@ -502,3 +502,5 @@ run_dump_test "nop"
run_dump_test "asciz"
run_dump_test "pr27384"
run_dump_test "pr27381"
run_dump_test "multibyte1"
run_dump_test "multibyte2"

View File

@ -0,0 +1,8 @@
.text
.globl heoll
heoll:
.nop
.globl hello
hello:
.nop

View File

@ -0,0 +1,3 @@
#source: multibyte.s
#as: --multibyte-handling=warn
#warning_output: multibyte1.l

View File

@ -0,0 +1,12 @@
[^:]*: Assembler messages:
[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0xac\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0x80\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0xae\) encountered in .*multibyte.s
[^:]*: Warning: multibyte character \(0xe2\) encountered in .*multibyte.s
[^:]*: Warning: further multibyte character warnings suppressed

View File

@ -0,0 +1,3 @@
#source: multibyte.s
#as: --multibyte-handling=warn-sym-only
#warning_output: multibyte2.l

View File

@ -0,0 +1,2 @@
[^:]*: Assembler messages:
[^:]*:3: Warning: symbol '.*' contains multibyte characters