Add --unicode option to control how UTF-8 encoded Unicode characters are handled by the display tools.

* nm.c: Add --unicode option to control how Unicode characters are handled.
* objdump.c: Likewise.
* readelf.c: Likewise.
* strings.c: Likewise.
* binutils.texi: Document the new feature.
* NEWS: Document the new feature.
* testsuite/binutils-all/unicode.exp: New file.
* testsuite/binutils-all/nm.hex.unicode
* testsuite/binutils-all/strings.escape.unicode
* testsuite/binutils-all/objdump.highlight.unicode
* testsuite/binutils-all/readelf.invalid.unicode
2025-10-18 05:12:33 +08:00 · 2021-11-09 13:25:42 +00:00
parent 024120b6ee
commit b3aa80b45c
7 changed files with 1411 additions and 105 deletions
--- a/binutils/nm.c
+++ b/binutils/nm.c
@ -38,6 +38,11 @@
 #include "bucomm.h"
 #include "plugin-api.h"
 #include "plugin.h"
+#include "safe-ctype.h"
+
+/* True if the NUL terminated strings A and B compare equal.
+   Only defined here if nothing earlier has already defined streq.  */
+#ifndef streq
+#define streq(a,b) (strcmp ((a),(b)) == 0)
+#endif

 /* When sorting by size, we use this structure to hold the size and a
   pointer to the minisymbol.  */
@ -216,6 +221,18 @@ static const char *plugin_target = NULL;
 static bfd *lineno_cache_bfd;
 static bfd *lineno_cache_rel_bfd;

+/* How well formed UTF-8 sequences in symbol names are displayed.
+   Selected with the -U/--unicode command line option.  */
+typedef enum unicode_display_type
+{
+  unicode_default = 0,	/* Names are printed without going through convert_utf8.  */
+  unicode_locale,	/* Sequences are copied to the output unchanged.  */
+  unicode_escape,	/* Sequences are shown as \u escapes.  */
+  unicode_hex,		/* Sequences are shown as <0x...> hex bytes.  */
+  unicode_highlight,	/* As unicode_escape, coloured red if fd 1 is a tty.  */
+  unicode_invalid	/* Sequences are shown as {0x...} hex bytes.  */
+} unicode_display_type;
+
+/* The display mode in effect; set while parsing the command line.  */
+static unicode_display_type unicode_display = unicode_default;
+
+
 enum long_option_values
 {
  OPTION_TARGET = 200,
@ -260,6 +277,7 @@ static struct option long_options[] =
  {"target", required_argument, 0, OPTION_TARGET},
  {"defined-only", no_argument, &defined_only, 1},
  {"undefined-only", no_argument, &undefined_only, 1},
+  {"unicode", required_argument, NULL, 'U'},
  {"version", no_argument, &show_version, 1},
  {"with-symbol-versions", no_argument, &with_symbol_versions, 1},
  {"without-symbol-versions", no_argument, &with_symbol_versions, 0},
@ -313,6 +331,8 @@ usage (FILE *stream, int status)
  -t, --radix=RADIX      Use RADIX for printing symbol values\n\
      --target=BFDNAME   Specify the target object format as BFDNAME\n\
  -u, --undefined-only   Display only undefined symbols\n\
+  -U {d|s|i|x|e|h}       Specify how to treat UTF-8 encoded unicode characters\n\
+      --unicode={default|show|invalid|hex|escape|highlight}\n\
      --with-symbol-versions  Display version strings after symbol names\n\
  -X 32_64               (ignored)\n\
  @FILE                  Read options from FILE\n\
@ -432,6 +452,187 @@ get_coff_symbol_type (const struct internal_syment *sym)
  return bufp;
 }

+/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT.
+   The conversion format is controlled by the unicode_display variable.
+   IN must be NUL terminated.  OUT must have room for the longest result:
+   20 characters plus the NUL that sprintf appends.
+   Returns the number of characters added to OUT.
+   Returns the number of bytes consumed from IN in CONSUMED.
+   Always consumes at least one byte and displays at least one character.
+   A byte that does not start a well formed 2, 3 or 4 byte sequence is
+   copied to OUT unchanged.  */
+
+static unsigned int
+display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
+{
+  char *        orig_out = out;
+  unsigned int  nchars = 0;
+  unsigned int j;
+
+  if (unicode_display == unicode_default)
+    goto invalid;
+
+  /* Only 110xxxxx, 1110xxxx and 11110xxx can start a sequence.  Bytes
+     from 0xf8 upwards never appear in UTF-8.  */
+  if (in[0] < 0xc0 || in[0] >= 0xf8)
+    goto invalid;
+
+  /* Each continuation byte is checked before the next one is read, so a
+     truncated sequence stops at the terminating NUL.  */
+  if ((in[1] & 0xc0) != 0x80)
+    goto invalid;
+
+  if ((in[0] & 0x20) == 0)
+    {
+      nchars = 2;
+      goto valid;
+    }
+
+  if ((in[2] & 0xc0) != 0x80)
+    goto invalid;
+
+  if ((in[0] & 0x10) == 0)
+    {
+      nchars = 3;
+      goto valid;
+    }
+
+  if ((in[3] & 0xc0) != 0x80)
+    goto invalid;
+
+  nchars = 4;
+
+ valid:
+  switch (unicode_display)
+    {
+    case unicode_locale:
+      /* Copy the bytes into the output buffer as is.  */
+      memcpy (out, in, nchars);
+      out += nchars;
+      break;
+
+    case unicode_invalid:
+    case unicode_hex:
+      /* The raw bytes in hex: <0x...> for hex, {0x...} for invalid.  */
+      out += sprintf (out, "%c0x", unicode_display == unicode_hex ? '<' : '{');
+      for (j = 0; j < nchars; j++)
+	out += sprintf (out, "%02x", in [j]);
+      out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}');
+      break;
+
+    case unicode_highlight:
+      if (isatty (1))
+	out += sprintf (out, "\x1B[31;47m"); /* Red.  */
+      /* Fall through.  */
+    case unicode_escape:
+      /* Print the decoded code point, two hex digits per byte.  */
+      switch (nchars)
+	{
+	case 2:
+	  /* 110aaaaa 10bbbbbb -> 00000aaa aabbbbbb.  */
+	  out += sprintf (out, "\\u%02x%02x",
+		  ((in[0] & 0x1c) >> 2),
+		  ((in[0] & 0x03) << 6) | (in[1] & 0x3f));
+	  break;
+
+	case 3:
+	  /* 1110aaaa 10bbbbbb 10cccccc -> aaaabbbb bbcccccc.  */
+	  out += sprintf (out, "\\u%02x%02x",
+		  ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2),
+		  ((in[1] & 0x03) << 6) | ((in[2] & 0x3f)));
+	  break;
+
+	case 4:
+	  /* 11110aaa 10bbbbbb 10cccccc 10dddddd
+	     -> 000aaabb bbbbcccc ccdddddd.  */
+	  out += sprintf (out, "\\u%02x%02x%02x",
+		  ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
+		  ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
+		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
+	  break;
+
+	default:
+	  /* Not reached: nchars is always 2, 3 or 4 here.  */
+	  break;
+	}
+
+      if (unicode_display == unicode_highlight && isatty (1))
+	out += sprintf (out, "\033[0m"); /* Default colour.  */
+      break;
+
+    default:
+      /* Not reached: unicode_default took the invalid path above.  */
+      break;
+    }
+
+  * consumed = nchars;
+  return out - orig_out;
+
+ invalid:
+  /* Not a valid UTF-8 sequence.  */
+  *out = *in;
+  * consumed = 1;
+  return 1;
+}
+
+/* Convert any UTF-8 encoded characters in IN into the form specified by
+   unicode_display.  Also converts control characters into caret notation
+   (^@ ... ^_, and ^? for DEL).
+   Returns IN itself if it contains nothing that needs converting, and ""
+   if IN is NULL.  Otherwise returns a static buffer which is overwritten
+   by the next call that has to convert something.
+   Code stolen from objdump.c:sanitize_string().  */
+
+static const char *
+convert_utf8 (const char * in)
+{
+  static char *  buffer = NULL;
+  static size_t  buffer_len = 0;
+  const char *   original = in;
+  char *         out;
+  size_t         needed;
+
+  /* Paranoia.  */
+  if (in == NULL)
+    return "";
+
+  /* See if any conversion is necessary.
+     In the majority of cases it will not be needed.  */
+  do
+    {
+      unsigned char c = *in++;
+
+      if (c == 0)
+	return original;
+
+      if (ISCNTRL (c))
+	break;
+
+      if (unicode_display != unicode_default && c >= 0xc0)
+	break;
+    }
+  while (1);
+
+  /* Copy the input, translating as needed.  The worst expansion is a two
+     byte sequence shown by unicode_highlight on a terminal: 8 characters
+     to turn the colour on, 6 of escape and 4 to turn the colour off,
+     which is 9 characters of output per byte of input.  */
+  in = original;
+  needed = strlen (in) * 9;
+  if (buffer_len < needed)
+    {
+      free (buffer);
+      buffer_len = needed;
+      buffer = xmalloc (buffer_len + 1);
+    }
+
+  out = buffer;
+  do
+    {
+      unsigned char c = *in++;
+
+      if (c == 0)
+	break;
+
+      if (ISCNTRL (c))
+	{
+	  /* Caret notation.  Flip bit 6 rather than adding 0x40 so that
+	     DEL (0x7f) becomes '?' and not the stray byte 0xbf.  */
+	  *out++ = '^';
+	  *out++ = c ^ 0x40;
+	}
+      else if (unicode_display != unicode_default && c >= 0xc0)
+	{
+	  unsigned int num_consumed;
+
+	  /* IN has already moved past C, so step back for the call and
+	     then skip whatever else the sequence used.  */
+	  out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed);
+	  in += num_consumed - 1;
+	}
+      else
+	*out++ = c;
+    }
+  while (1);
+
+  *out = 0;
+  return buffer;
+}
+
 /* Print symbol name NAME, read from ABFD, with printf format FORM,
   demangling it if requested.  */

@ -444,6 +645,7 @@ print_symname (const char *form, struct extended_symbol_info *info,

  if (name == NULL)
    name = info->sinfo->name;
+
  if (!with_symbol_versions
      && bfd_get_flavour (abfd) == bfd_target_elf_flavour)
    {
@ -451,6 +653,7 @@ print_symname (const char *form, struct extended_symbol_info *info,
      if (atver)
 	*atver = 0;
    }
+
  if (do_demangle && *name)
    {
      alloc = bfd_demangle (abfd, name, demangle_flags);
@ -458,6 +661,11 @@ print_symname (const char *form, struct extended_symbol_info *info,
 	name = alloc;
    }

+  if (unicode_display != unicode_default)
+    {
+      name = convert_utf8 (name);
+    }
+
  if (info != NULL && info->elfinfo && with_symbol_versions)
    {
      const char *version_string;
@ -1808,7 +2016,7 @@ main (int argc, char **argv)
    fatal (_("fatal error: libbfd ABI mismatch"));
  set_default_bfd_target ();

-  while ((c = getopt_long (argc, argv, "aABCDef:gHhjJlnopPrSst:uvVvX:",
+  while ((c = getopt_long (argc, argv, "aABCDef:gHhjJlnopPrSst:uU:vVvX:",
 			   long_options, (int *) 0)) != EOF)
    {
      switch (c)
@ -1901,6 +2109,24 @@ main (int argc, char **argv)
 	case 'u':
 	  undefined_only = 1;
 	  break;
+
+	case 'U':
+	  if (streq (optarg, "default") || streq (optarg, "d"))
+	    unicode_display = unicode_default;
+	  else if (streq (optarg, "locale") || streq (optarg, "l"))
+	    unicode_display = unicode_locale;
+	  else if (streq (optarg, "escape") || streq (optarg, "e"))
+	    unicode_display = unicode_escape;
+	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
+	    unicode_display = unicode_invalid;
+	  else if (streq (optarg, "hex") || streq (optarg, "x"))
+	    unicode_display = unicode_hex;
+	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
+	    unicode_display = unicode_highlight;
+	  else
+	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
+	  break;
+
 	case 'V':
 	  show_version = 1;
 	  break;