gdb: disassembler opcode display formatting

This commit changes the format of 'disassemble /r' to match GNU objdump. Specifically, GDB will now display the instruction bytes as 'objdump --wide --disassemble' does. Here is an example for RISC-V before this patch: (gdb) disassemble /r 0x0001018e,0x0001019e Dump of assembler code from 0x1018e to 0x1019e: 0x0001018e <call_me+66>: 03 26 84 fe lw a2,-24(s0) 0x00010192 <call_me+70>: 83 25 c4 fe lw a1,-20(s0) 0x00010196 <call_me+74>: 61 65 lui a0,0x18 0x00010198 <call_me+76>: 13 05 85 6a addi a0,a0,1704 0x0001019c <call_me+80>: f1 22 jal 0x10368 <printf> End of assembler dump. And here's an example after this patch: (gdb) disassemble /r 0x0001018e,0x0001019e Dump of assembler code from 0x1018e to 0x1019e: 0x0001018e <call_me+66>: fe842603 lw a2,-24(s0) 0x00010192 <call_me+70>: fec42583 lw a1,-20(s0) 0x00010196 <call_me+74>: 6561 lui a0,0x18 0x00010198 <call_me+76>: 6a850513 addi a0,a0,1704 0x0001019c <call_me+80>: 22f1 jal 0x10368 <printf> End of assembler dump. There are two differences here. First, the instruction bytes after the patch are grouped based on the size of the instruction, and are byte-swapped to little-endian order. Second, after the patch, GDB now uses the bytes-per-line hint from libopcodes to add whitespace padding after the opcode bytes; this means that in most cases the instructions are nicely aligned. It is still possible for a very long instruction to intrude into the disassembled text space. The next example is x86-64, before the patch: (gdb) disassemble /r main Dump of assembler code for function main: 0x0000000000401106 <+0>: 55 push %rbp 0x0000000000401107 <+1>: 48 89 e5 mov %rsp,%rbp 0x000000000040110a <+4>: c7 87 d8 00 00 00 01 00 00 00 movl $0x1,0xd8(%rdi) 0x0000000000401114 <+14>: b8 00 00 00 00 mov $0x0,%eax 0x0000000000401119 <+19>: 5d pop %rbp 0x000000000040111a <+20>: c3 ret End of assembler dump. 
And after the patch: (gdb) disassemble /r main Dump of assembler code for function main: 0x0000000000401106 <+0>: 55 push %rbp 0x0000000000401107 <+1>: 48 89 e5 mov %rsp,%rbp 0x000000000040110a <+4>: c7 87 d8 00 00 00 01 00 00 00 movl $0x1,0xd8(%rdi) 0x0000000000401114 <+14>: b8 00 00 00 00 mov $0x0,%eax 0x0000000000401119 <+19>: 5d pop %rbp 0x000000000040111a <+20>: c3 ret End of assembler dump. Most instructions are aligned, except for the very long instruction. Notice too that for x86-64 libopcodes doesn't request that GDB group the instruction bytes. This matches the behaviour of objdump. In case the user really wants the old behaviour, I have added a new modifier 'disassemble /b', which displays the instruction bytes one byte at a time. For x86-64, which never groups instruction bytes, /b and /r are equivalent, but for RISC-V, using /b gets the old layout back (except that the whitespace for alignment is still present). Consider our original RISC-V example, this time using /b: (gdb) disassemble /b 0x0001018e,0x0001019e Dump of assembler code from 0x1018e to 0x1019e: 0x0001018e <call_me+66>: 03 26 84 fe lw a2,-24(s0) 0x00010192 <call_me+70>: 83 25 c4 fe lw a1,-20(s0) 0x00010196 <call_me+74>: 61 65 lui a0,0x18 0x00010198 <call_me+76>: 13 05 85 6a addi a0,a0,1704 0x0001019c <call_me+80>: f1 22 jal 0x10368 <printf> End of assembler dump. Obviously, this patch is a potentially significant change to the behaviour of /r. I could have added /b with the new behaviour and left /r alone. However, personally, I feel the new behaviour is significantly better than the old, hence, I made /r be what I consider the "better" behaviour. The reason I prefer the new behaviour is that, when I use /r, I almost always want to manually decode the instruction for some reason, and having the bytes displayed in "instruction order" rather than memory order, just makes this easier. 
The 'record instruction-history' command also takes a /r modifier, and has been modified in the same way as disassemble; /r gets the new behaviour, and /b has been added to retain the old behaviour. Finally, the MI command -data-disassemble is unchanged in behaviour; this command now requests the raw bytes of the instruction, which is equivalent to the /b modifier. This means that the MI output will remain backward compatible.
2025-06-17 16:05:56 +08:00 · 2022-06-21 20:23:35 +01:00
parent d309a8f9b3
commit d4ce49b7ac
8 changed files with 109 additions and 16 deletions
--- a/gdb/NEWS
+++ b/gdb/NEWS
@ -59,6 +59,18 @@
 * gdb now supports zstd compressed debug sections (ELFCOMPRESS_ZSTD) for ELF.
 * The format of 'disassemble /r' and 'record instruction-history /r'
  has changed.  The instruction bytes could now be grouped together,
  and displayed in the endianness of the instruction.  This is the
  same layout as used by GNU objdump when disassembling.
  There is now 'disassemble /b' and 'record instruction-history /b'
  which will always display the instruction bytes one at a time in
  memory order, that is, the byte at the lowest address first.
  For both /r and /b GDB is now better at using whitespace in order to
  align the disassembled instruction text.
 * New commands
 maintenance set ignore-prologue-end-flag on|off
--- a/gdb/cli/cli-cmds.c
+++ b/gdb/cli/cli-cmds.c
@ -1508,6 +1508,9 @@ disassemble_current_function (gdb_disassembly_flags flags)
   A /r modifier will include raw instructions in hex with the assembly.
   A /b modifier is similar to /r except the instruction bytes are printed
   as separate bytes with no grouping, or endian switching.
   A /s modifier will include source code with the assembly, like /m, with
   two important differences:
   1) The output is still in pc address order.
@ -1546,6 +1549,9 @@ disassemble_command (const char *arg, int from_tty)
 	    case 'r':
 	      flags |= DISASSEMBLY_RAW_INSN;
 	      break;
 	    case 'b':
 	      flags |= DISASSEMBLY_RAW_BYTES;
 	      break;
 	    case 's':
 	      flags |= DISASSEMBLY_SOURCE;
 	      break;
--- a/gdb/disasm-flags.h
+++ b/gdb/disasm-flags.h
@ -33,6 +33,7 @@ enum gdb_disassembly_flag
    DISASSEMBLY_OMIT_PC = (0x1 << 4),
    DISASSEMBLY_SOURCE = (0x1 << 5),
    DISASSEMBLY_SPECULATIVE = (0x1 << 6),
    DISASSEMBLY_RAW_BYTES = (0x1 << 7),
  };
 DEF_ENUM_FLAGS_TYPE (enum gdb_disassembly_flag, gdb_disassembly_flags);
--- a/gdb/disasm.c
+++ b/gdb/disasm.c
@ -457,7 +457,7 @@ gdb_pretty_print_disassembler::pretty_print_insn (const struct disasm_insn *insn
 	throw ex;
      }
-    if (flags & DISASSEMBLY_RAW_INSN)
+    if ((flags & (DISASSEMBLY_RAW_INSN | DISASSEMBLY_RAW_BYTES)) != 0)
      {
 	/* Build the opcodes using a temporary stream so we can
 	   write them out in a single go for the MI.  */
@ -467,14 +467,51 @@ gdb_pretty_print_disassembler::pretty_print_insn (const struct disasm_insn *insn
 	m_opcode_data.resize (size);
 	read_code (pc, m_opcode_data.data (), size);
-	for (int i = 0; i < size; ++i)
+	/* The disassembler provides information about the best way to
 	   display the instruction bytes to the user.  We provide some sane
 	   defaults in case the disassembler gets it wrong.  */
 	const struct disassemble_info *di = m_di.disasm_info ();
 	int bytes_per_line = std::max (di->bytes_per_line, size);
 	int bytes_per_chunk = std::max (di->bytes_per_chunk, 1);
 	/* If the user has requested the instruction bytes be displayed
 	   byte at a time, then handle that here.  Also, if the instruction
 	   is not a multiple of the chunk size (which probably indicates a
 	   disassembler problem) then avoid that causing display problems
 	   by switching to byte at a time mode.  */
 	if ((flags & DISASSEMBLY_RAW_BYTES) != 0
 	    || (size % bytes_per_chunk) != 0)
 	  bytes_per_chunk = 1;
 	/* Print the instruction opcodes bytes, grouped into chunks.  */
 	for (int i = 0; i < size; i += bytes_per_chunk)
 	  {
 	    if (i > 0)
 	      m_opcode_stb.puts (" ");
-	    m_opcode_stb.printf ("%02x", (unsigned) m_opcode_data[i]);
+
 	    if (di->display_endian == BFD_ENDIAN_LITTLE)
 	      {
 		for (int k = bytes_per_chunk; k-- != 0; )
 		  m_opcode_stb.printf ("%02x", (unsigned) m_opcode_data[i + k]);
 	      }
 	    else
 	      {
 		for (int k = 0; k < bytes_per_chunk; k++)
 		  m_opcode_stb.printf ("%02x", (unsigned) m_opcode_data[i + k]);
 	      }
 	  }
 	/* Calculate required padding.  */
 	int nspaces = 0;
 	for (int i = size; i < bytes_per_line; i += bytes_per_chunk)
 	  {
 	    if (i > size)
 	      nspaces++;
 	    nspaces += bytes_per_chunk * 2;
 	  }
 	m_uiout->field_stream ("opcodes", m_opcode_stb);
 	m_uiout->spaces (nspaces);
 	m_uiout->text ("\t");
      }
--- a/gdb/doc/gdb.texinfo
+++ b/gdb/doc/gdb.texinfo
@ -7945,7 +7945,10 @@ are printed in execution order.
 It can also print mixed source+disassembly if you specify the
@code{/m} or @code{/s} modifier, and print the raw instructions in hex
-as well as in symbolic form by specifying the @code{/r} modifier.
+as well as in symbolic form by specifying the @code{/r} or @code{/b}
 modifier.  The behaviour of the @code{/m}, @code{/s}, @code{/r}, and
@code{/b} modifiers is the same as for the @kbd{disassemble} command
 (@pxref{disassemble,,@kbd{disassemble}}).
 The current position marker is printed for the instruction at the
 current program counter value.  This instruction can appear multiple
@ -9859,6 +9862,7 @@ After @code{info line}, using @code{info line} again without
 specifying a location will display information about the next source
 line.
@anchor{disassemble}
@table @code
@kindex disassemble
@cindex assembly instructions
@ -9869,16 +9873,17 @@ line.
@itemx disassemble /m
@itemx disassemble /s
@itemx disassemble /r
@itemx disassemble /b
 This specialized command dumps a range of memory as machine
 instructions.  It can also print mixed source+disassembly by specifying
-the @code{/m} or @code{/s} modifier and print the raw instructions in hex
+the @code{/m} or @code{/s} modifier and print the raw instructions in
-as well as in symbolic form by specifying the @code{/r} modifier.
+hex as well as in symbolic form by specifying the @code{/r} or @code{/b}
-The default memory range is the function surrounding the
+modifier.  The default memory range is the function surrounding the
 program counter of the selected frame.  A single argument to this
 command is a program counter value; @value{GDBN} dumps the function
-surrounding this value.  When two arguments are given, they should
+surrounding this value.  When two arguments are given, they should be
-be separated by a comma, possibly surrounded by whitespace.  The
+separated by a comma, possibly surrounded by whitespace.  The arguments
-arguments specify a range of addresses to dump, in one of two forms:
+specify a range of addresses to dump, in one of two forms:
@table @code
@item @var{start},@var{end}
@ -9916,6 +9921,35 @@ Dump of assembler code from 0x32c4 to 0x32e4:
 End of assembler dump.
@end smallexample
 The following two examples are for RISC-V, and demonstrate the
 difference between the @code{/r} and @code{/b} modifiers.  First with
@code{/b}, the bytes of the instruction are printed, in hex, in memory
 order:
@smallexample
 (@value{GDBP}) disassemble /b 0x00010150,0x0001015c
 Dump of assembler code from 0x10150 to 0x1015c:
   0x00010150 <call_me+4>:      22 dc                 	sw	s0,56(sp)
   0x00010152 <call_me+6>:      80 00                 	addi	s0,sp,64
   0x00010154 <call_me+8>:      23 26 a4 fe           	sw	a0,-20(s0)
   0x00010158 <call_me+12>:     23 24 b4 fe           	sw	a1,-24(s0)
 End of assembler dump.
@end smallexample
 In contrast, with @code{/r} the bytes of the instruction are displayed
 in the instruction order, for RISC-V this means that the bytes have been
 swapped to little-endian order:
@smallexample
 (@value{GDBP}) disassemble /r 0x00010150,0x0001015c
 Dump of assembler code from 0x10150 to 0x1015c:
   0x00010150 <call_me+4>:      dc22              	sw	s0,56(sp)
   0x00010152 <call_me+6>:      0080              	addi	s0,sp,64
   0x00010154 <call_me+8>:      fea42623        	sw	a0,-20(s0)
   0x00010158 <call_me+12>:     feb42423        	sw	a1,-24(s0)
 End of assembler dump.
@end smallexample
 Here is an example showing mixed source+assembly for Intel x86
 with @code{/m} or @code{/s}, when the program is stopped just after
 function prologue in a non-optimized function with no inline code.
--- a/gdb/mi/mi-cmd-disas.c
+++ b/gdb/mi/mi-cmd-disas.c
@ -165,16 +165,16 @@ mi_cmd_disassemble (const char *command, char **argv, int argc)
      disasm_flags |= DISASSEMBLY_SOURCE_DEPRECATED;
      break;
    case 2:
-      disasm_flags |= DISASSEMBLY_RAW_INSN;
+      disasm_flags |= DISASSEMBLY_RAW_BYTES;
      break;
    case 3:
-      disasm_flags |= DISASSEMBLY_SOURCE_DEPRECATED | DISASSEMBLY_RAW_INSN;
+      disasm_flags |= DISASSEMBLY_SOURCE_DEPRECATED | DISASSEMBLY_RAW_BYTES;
      break;
    case 4:
      disasm_flags |= DISASSEMBLY_SOURCE;
      break;
    case 5:
-      disasm_flags |= DISASSEMBLY_SOURCE | DISASSEMBLY_RAW_INSN;
+      disasm_flags |= DISASSEMBLY_SOURCE | DISASSEMBLY_RAW_BYTES;
      break;
    default:
      gdb_assert_not_reached ("bad disassembly mode");
--- a/gdb/record.c
+++ b/gdb/record.c
@ -494,6 +494,9 @@ get_insn_history_modifiers (const char **arg)
 	    case 'r':
 	      modifiers |= DISASSEMBLY_RAW_INSN;
 	      break;
 	    case 'b':
 	      modifiers |= DISASSEMBLY_RAW_BYTES;
 	      break;
 	    case 'f':
 	      modifiers |= DISASSEMBLY_OMIT_FNAME;
 	      break;
--- a/gdb/testsuite/gdb.mi/mi-disassemble.exp
+++ b/gdb/testsuite/gdb.mi/mi-disassemble.exp
@ -245,12 +245,12 @@ proc test_disassembly_opcode_format {} {
    # then disassemble using the MI command.
    set longest_insn_bytes ""
    set longest_insn_addr ""
-    gdb_test_multiple "disassemble /r main" "" {
+    gdb_test_multiple "disassemble /b main" "" {
-	-re "^disassemble /r main\r\n" {
+	-re "^disassemble /b main\r\n" {
 	    exp_continue
 	}
-	-re "^&\"disassemble /r main.n\"\r\n" {
+	-re "^&\"disassemble /b main.n\"\r\n" {
 	    exp_continue
 	}