1#ifndef LIBDISASM_H
2#define LIBDISASM_H
3
4#include <stdint.h>
5
/* "NEW" types
7 * __________________________________________________________________________*/
#ifndef LIBDISASM_QWORD_H	/* do not interfere with qword.h */
#define LIBDISASM_QWORD_H
/* qword_t : signed 64-bit integer.  MSVC builds use the compiler's
 * native __int64; every other compiler takes int64_t from <stdint.h>. */
#if defined(_MSC_VER)
typedef __int64 qword_t;
#else
typedef int64_t qword_t;
#endif	/* _MSC_VER */
#endif	/* LIBDISASM_QWORD_H */
16
17#include <sys/types.h>
18
19#ifdef __cplusplus
20extern "C" {
21#endif
22
/* "NEW" x86 API
24 * __________________________________________________________________________*/
25
26
/* ========================================= Error Reporting */
/* REPORT CODES
 * One of these codes is handed to the reporter function that was supplied
 * at initialization.  The code determines the type of the 'data' argument
 * passed along with it; this allows the reporter to recover from errors,
 * or just log them.
 */
enum x86_report_codes {
	/* RVA OUT OF BOUNDS: The disassembler could not disassemble the
	 * supplied RVA as it is out of the range of the buffer.  The
	 * application should store the address and attempt to determine
	 * what section of the binary it is in, then disassemble the
	 * address from the bytes in that section.
	 *   data: uint32_t rva */
	report_disasm_bounds = 0,

	/* INSTRUCTION OUT OF BOUNDS: The disassembler could not disassemble
	 * the instruction as the instruction would require bytes beyond the
	 * end of the current buffer.  This usually indicates garbage bytes
	 * at the end of a buffer, or an incorrectly-sized buffer.
	 *   data: uint32_t rva */
	report_insn_bounds = 1,

	/* INVALID INSTRUCTION: The disassembler could not disassemble the
	 * instruction as it has an invalid combination of opcodes and
	 * operands.  This will stop automated disassembly; the application
	 * can restart the disassembly after the invalid instruction.
	 *   data: uint32_t rva */
	report_invalid_insn = 2,

	report_unknown = 3
};
58
/* DISASM_REPORTER : prototype of the caller-supplied error reporter.
 *    code : provided by libdisasm; it is one of enum x86_report_codes
 *    data : provided by libdisasm and is context-specific, per the
 *           comments on each report code
 *    arg  : optional arbitrary data provided by the code passing the
 *           callback -- for example, it could be 'this' or 'self' in
 *           OOP code */
typedef void (*DISASM_REPORTER)( enum x86_report_codes code,
				 void *data, void *arg );
65
66
/* x86_report_error : Call the registered reporter to report an error.
 *    code : the error being reported
 *    data : context-specific data accompanying 'code' */
void x86_report_error( enum x86_report_codes code, void *data );
69
/* ========================================= Libdisasm Management Routines */
/* x86_options : disassembler options.  Every non-zero option occupies its
 * own bit, so these can be ORed together.  The last enumerator carries no
 * trailing comma so that the header is also accepted by strict C89 and
 * C++98 compilers. */
enum x86_options {
	opt_none          = 0,
	opt_ignore_nulls  = 1,	/* ignore sequences of > 4 NULL bytes */
	opt_16_bit        = 2,	/* 16-bit/DOS disassembly */
	opt_att_mnemonics = 4	/* use AT&T syntax names for alternate opcode mnemonics */
};
77
/* management routines */
/* 'arg' is caller-specific data which is handed back to the reporter
 * callback routine as its final ('arg') parameter; see DISASM_REPORTER.
 * NOTE(review): the meaning of the int returned by x86_init() and
 * x86_cleanup() is not documented in this header -- confirm against the
 * implementation. */
int x86_init( enum x86_options options, DISASM_REPORTER reporter, void *arg);
void x86_set_reporter( DISASM_REPORTER reporter, void *arg);
void x86_set_options( enum x86_options options );
enum x86_options x86_get_options( void );
int x86_cleanup(void);
86
87
/* ========================================= Instruction Representation */
/* these defines are only intended for use in the array decl's */
#define MAX_REGNAME 8		/* size of x86_reg_t.name */

#define MAX_PREFIX_STR 32	/* size of x86_insn_t.prefix_string */
#define MAX_MNEM_STR 16		/* size of x86_insn_t.mnemonic */
#define MAX_INSN_SIZE 20	/* same as in i386.h; size of x86_insn_t.bytes */
#define MAX_OP_STRING 32	/* max possible operand size in string form */
#define MAX_OP_RAW_STRING 64	/* max possible operand size in raw form */
#define MAX_OP_XML_STRING 256	/* max possible operand size in xml form */
#define MAX_NUM_OPERANDS 8	/* max # implicit and explicit operands */
/* in these, the '2 *' is arbitrary: the max # of operands should require
 * more space than the rest of the insn.  The '8' in each formula is the
 * value of MAX_NUM_OPERANDS. */
#define MAX_INSN_STRING 512	/* 2 * 8 * MAX_OP_STRING */
#define MAX_INSN_RAW_STRING 1024	/* 2 * 8 * MAX_OP_RAW_STRING */
#define MAX_INSN_XML_STRING 4096	/* 2 * 8 * MAX_OP_XML_STRING */
104
/* x86_reg_type : what a register is used for.
 * NOTE: each type is a single bit, so these may be ORed together */
enum x86_reg_type {
	reg_gen     = 1 << 0,	/* general purpose */
	reg_in      = 1 << 1,	/* incoming args, ala RISC */
	reg_out     = 1 << 2,	/* args to calls, ala RISC */
	reg_local   = 1 << 3,	/* local vars, ala RISC */
	reg_fpu     = 1 << 4,	/* FPU data register */
	reg_seg     = 1 << 5,	/* segment register */
	reg_simd    = 1 << 6,	/* SIMD/MMX reg */
	reg_sys     = 1 << 7,	/* restricted/system register */
	reg_sp      = 1 << 8,	/* stack pointer */
	reg_fp      = 1 << 9,	/* frame pointer */
	reg_pc      = 1 << 10,	/* program counter */
	reg_retaddr = 1 << 11,	/* return addr for func */
	reg_cond    = 1 << 12,	/* condition code / flags */
	reg_zero    = 1 << 13,	/* zero register, ala RISC */
	reg_ret     = 1 << 14,	/* return value */
	/* bit 15 is not assigned */
	reg_src     = 1 << 16,	/* array/rep source */
	reg_dest    = 1 << 17,	/* array/rep destination */
	reg_count   = 1 << 18	/* array/rep/loop counter */
};
125
126/* x86_reg_t : an X86 CPU register */
127typedef struct {
128 char name[MAX_REGNAME];
129 enum x86_reg_type type; /* what register is used for */
130 unsigned int size; /* size of register in bytes */
131 unsigned int id; /* register ID #, for quick compares */
132 unsigned int alias; /* ID of reg this is an alias for */
133 unsigned int shift; /* amount to shift aliased reg by */
134} x86_reg_t;
135
136/* x86_ea_t : an X86 effective address (address expression) */
137typedef struct {
138 unsigned int scale; /* scale factor */
139 x86_reg_t index, base; /* index, base registers */
140 int32_t disp; /* displacement */
141 char disp_sign; /* is negative? 1/0 */
142 char disp_size; /* 0, 1, 2, 4 */
143} x86_ea_t;
144
/* x86_absolute_t : an X86 segment:offset address (descriptor).  The offset
 * is kept in a union holding either its 16-bit or its 32-bit form. */
typedef struct {
	unsigned short int segment;		/* loaded directly into CS */
	union {
		unsigned short int off16;	/* loaded directly into IP */
		uint32_t           off32;	/* loaded directly into EIP */
	} offset;
} x86_absolute_t;
153
/* x86_op_type : the kind of an operand.  These are mutually exclusive. */
enum x86_op_type {
	op_unused        = 0,	/* empty/unused operand: should never occur */
	op_register      = 1,	/* CPU register */
	op_immediate     = 2,	/* Immediate Value */
	op_relative_near = 3,	/* Relative offset from IP */
	op_relative_far  = 4,	/* Relative offset from IP */
	op_absolute      = 5,	/* Absolute address (ptr16:32) */
	op_expression    = 6,	/* Address expression (scale/index/base/disp) */
	op_offset        = 7,	/* Offset from start of segment (m32) */
	op_unknown       = 8
};
165
/* Operand-type predicates.  Each takes an enum x86_op_type value and
 * yields non-zero when the type falls in the named class:
 *    x86_optype_is_address  : op_absolute or op_offset
 *    x86_optype_is_relative : op_relative_near or op_relative_far
 *    x86_optype_is_memory   : anything strictly between op_immediate and
 *                             op_unknown
 * The argument is parenthesized so that compound expressions (e.g. a
 * ternary) are classified correctly.  It is still evaluated twice, so it
 * must not have side effects. */
#define x86_optype_is_address( optype ) \
	( (optype) == op_absolute || (optype) == op_offset )
#define x86_optype_is_relative( optype ) \
	( (optype) == op_relative_near || (optype) == op_relative_far )
#define x86_optype_is_memory( optype ) \
	( (optype) > op_immediate && (optype) < op_unknown )
172
/* x86_op_datatype : the data type (and therefore the size) of an operand.
 * These use Intel's lame terminology.  The last enumerator carries no
 * trailing comma so that the header is also accepted by strict C89 and
 * C++98 compilers. */
enum x86_op_datatype {
	op_byte       = 1,	/* 1 byte integer */
	op_word       = 2,	/* 2 byte integer */
	op_dword      = 3,	/* 4 byte integer */
	op_qword      = 4,	/* 8 byte integer */
	op_dqword     = 5,	/* 16 byte integer */
	op_sreal      = 6,	/* 4 byte real (single real) */
	op_dreal      = 7,	/* 8 byte real (double real) */
	op_extreal    = 8,	/* 10 byte real (extended real) */
	op_bcd        = 9,	/* 10 byte binary-coded decimal */
	op_ssimd      = 10,	/* 16 byte : 4 packed single FP (SIMD, MMX) */
	op_dsimd      = 11,	/* 16 byte : 2 packed double FP (SIMD, MMX) */
	op_sssimd     = 12,	/* 4 byte : scalar single FP (SIMD, MMX) */
	op_sdsimd     = 13,	/* 8 byte : scalar double FP (SIMD, MMX) */
	op_descr32    = 14,	/* 6 byte Intel descriptor 2:4 */
	op_descr16    = 15,	/* 4 byte Intel descriptor 2:2 */
	op_pdescr32   = 16,	/* 6 byte Intel pseudo-descriptor 32:16 */
	op_pdescr16   = 17,	/* 6 byte Intel pseudo-descriptor 8:24:16 */
	op_bounds16   = 18,	/* signed 16:16 lower:upper bounds */
	op_bounds32   = 19,	/* signed 32:32 lower:upper bounds */
	op_fpuenv16   = 20,	/* 14 byte FPU control/environment data */
	op_fpuenv32   = 21,	/* 28 byte FPU control/environment data */
	op_fpustate16 = 22,	/* 94 byte FPU state (env & reg stack) */
	op_fpustate32 = 23,	/* 108 byte FPU state (env & reg stack) */
	op_fpregset   = 24,	/* 512 bytes: register set */
	op_fpreg      = 25,	/* FPU register */
	op_none       = 0xFF	/* operand without a datatype (INVLPG) */
};
201
/* x86_op_access : how an operand is accessed.  One bit per access kind;
 * these are ORed together. */
enum x86_op_access {
	op_read    = 0x1,
	op_write   = 0x2,
	op_execute = 0x4
};
207
/* x86_op_flags : miscellaneous operand flags.  The low flags are single
 * bits and are ORed together; the segment overrides share the 0xF00 field
 * and are mutually exclusive. */
enum x86_op_flags {
	op_signed   = 0x001,	/* signed integer */
	op_string   = 0x002,	/* possible string or array */
	op_constant = 0x004,	/* symbolic constant */
	op_pointer  = 0x008,	/* operand points to a memory address */
	op_sysref   = 0x010,	/* operand is a syscall number */
	op_implied  = 0x020,	/* operand is implicit in the insn */
	op_hardcode = 0x040,	/* operand is hardcoded in insn definition */
	/* NOTE: an 'implied' operand is one which can be considered a side
	 * effect of the insn, e.g. %esp being modified by PUSH or POP. A
	 * 'hard-coded' operand is one which is specified in the instruction
	 * definition, e.g. %es:%edi in MOVSB or 1 in ROL Eb, 1. The difference
	 * is that hard-coded operands are printed by disassemblers and are
	 * required to re-assemble, while implicit operands are invisible. */
	op_es_seg   = 0x100,	/* ES segment override */
	op_cs_seg   = 0x200,	/* CS segment override */
	op_ss_seg   = 0x300,	/* SS segment override */
	op_ds_seg   = 0x400,	/* DS segment override */
	op_fs_seg   = 0x500,	/* FS segment override */
	op_gs_seg   = 0x600	/* GS segment override */
};
229
230/* x86_op_t : an X86 instruction operand */
231typedef struct {
232 enum x86_op_type type; /* operand type */
233 enum x86_op_datatype datatype; /* operand size */
234 enum x86_op_access access; /* operand access [RWX] */
235 enum x86_op_flags flags; /* misc flags */
236 union {
237 /* sizeof will have to work on these union members! */
238 /* immediate values */
239 char sbyte;
240 short sword;
241 int32_t sdword;
242 qword_t sqword;
243 unsigned char byte;
244 unsigned short word;
245 uint32_t dword;
246 qword_t qword;
247 float sreal;
248 double dreal;
249 /* misc large/non-native types */
250 unsigned char extreal[10];
251 unsigned char bcd[10];
252 qword_t dqword[2];
253 unsigned char simd[16];
254 unsigned char fpuenv[28];
255 /* offset from segment */
256 uint32_t offset;
257 /* ID of CPU register */
258 x86_reg_t reg;
259 /* offsets from current insn */
260 char relative_near;
261 int32_t relative_far;
262 /* segment:offset */
263 x86_absolute_t absolute;
264 /* effective address [expression] */
265 x86_ea_t expression;
266 } data;
267 /* this is needed to make formatting operands more sane */
268 void * insn; /* pointer to x86_insn_t owning operand */
269} x86_op_t;
270
271/* Linked list of x86_op_t; provided for manual traversal of the operand
272 * list in an insn. Users wishing to add operands to this list, e.g. to add
273 * implicit operands, should use x86_operand_new in x86_operand_list.h */
274typedef struct x86_operand_list {
275 x86_op_t op;
276 struct x86_operand_list *next;
277} x86_oplist_t;
278
/* x86_insn_group : broad category of an instruction.  The values 0xB and
 * 0xC are not assigned. */
enum x86_insn_group {
	insn_none        = 0x0,	/* invalid instruction */
	insn_controlflow = 0x1,
	insn_arithmetic  = 0x2,
	insn_logic       = 0x3,
	insn_stack       = 0x4,
	insn_comparison  = 0x5,
	insn_move        = 0x6,
	insn_string      = 0x7,
	insn_bit_manip   = 0x8,
	insn_flag_manip  = 0x9,
	insn_fpu         = 0xA,
	insn_interrupt   = 0xD,
	insn_system      = 0xE,
	insn_other       = 0xF
};
295
/* x86_insn_type : the specific type of an instruction.  The types are
 * listed by group; the top hex digit of each value equals the number of
 * the corresponding x86_insn_group enumerator. */
enum x86_insn_type {
	insn_invalid      = 0,		/* invalid instruction */

	/* ---- insn_controlflow ---- */
	insn_jmp          = 0x1001,
	insn_jcc          = 0x1002,
	insn_call         = 0x1003,
	insn_callcc       = 0x1004,
	insn_return       = 0x1005,

	/* ---- insn_arithmetic ---- */
	insn_add          = 0x2001,
	insn_sub          = 0x2002,
	insn_mul          = 0x2003,
	insn_div          = 0x2004,
	insn_inc          = 0x2005,
	insn_dec          = 0x2006,
	insn_shl          = 0x2007,
	insn_shr          = 0x2008,
	insn_rol          = 0x2009,
	insn_ror          = 0x200A,

	/* ---- insn_logic ---- */
	insn_and          = 0x3001,
	insn_or           = 0x3002,
	insn_xor          = 0x3003,
	insn_not          = 0x3004,
	insn_neg          = 0x3005,

	/* ---- insn_stack ---- */
	insn_push         = 0x4001,
	insn_pop          = 0x4002,
	insn_pushregs     = 0x4003,
	insn_popregs      = 0x4004,
	insn_pushflags    = 0x4005,
	insn_popflags     = 0x4006,
	insn_enter        = 0x4007,
	insn_leave        = 0x4008,

	/* ---- insn_comparison ---- */
	insn_test         = 0x5001,
	insn_cmp          = 0x5002,

	/* ---- insn_move ---- */
	insn_mov          = 0x6001,	/* move */
	insn_movcc        = 0x6002,	/* conditional move */
	insn_xchg         = 0x6003,	/* exchange */
	insn_xchgcc       = 0x6004,	/* conditional exchange */

	/* ---- insn_string ---- */
	insn_strcmp       = 0x7001,
	insn_strload      = 0x7002,
	insn_strmov       = 0x7003,
	insn_strstore     = 0x7004,
	insn_translate    = 0x7005,	/* xlat */

	/* ---- insn_bit_manip ---- */
	insn_bittest      = 0x8001,
	insn_bitset       = 0x8002,
	insn_bitclear     = 0x8003,

	/* ---- insn_flag_manip ---- */
	insn_clear_carry  = 0x9001,
	insn_clear_zero   = 0x9002,
	insn_clear_oflow  = 0x9003,
	insn_clear_dir    = 0x9004,
	insn_clear_sign   = 0x9005,
	insn_clear_parity = 0x9006,
	insn_set_carry    = 0x9007,
	insn_set_zero     = 0x9008,
	insn_set_oflow    = 0x9009,
	insn_set_dir      = 0x900A,
	insn_set_sign     = 0x900B,
	insn_set_parity   = 0x900C,
	insn_tog_carry    = 0x9010,
	insn_tog_zero     = 0x9020,
	insn_tog_oflow    = 0x9030,
	insn_tog_dir      = 0x9040,
	insn_tog_sign     = 0x9050,
	insn_tog_parity   = 0x9060,

	/* ---- insn_fpu ---- */
	insn_fmov         = 0xA001,
	insn_fmovcc       = 0xA002,
	insn_fneg         = 0xA003,
	insn_fabs         = 0xA004,
	insn_fadd         = 0xA005,
	insn_fsub         = 0xA006,
	insn_fmul         = 0xA007,
	insn_fdiv         = 0xA008,
	insn_fsqrt        = 0xA009,
	insn_fcmp         = 0xA00A,
	insn_fcos         = 0xA00C,
	insn_fldpi        = 0xA00D,
	insn_fldz         = 0xA00E,
	insn_ftan         = 0xA00F,
	insn_fsine        = 0xA010,
	insn_fsys         = 0xA020,

	/* ---- insn_interrupt ---- */
	insn_int          = 0xD001,
	insn_intcc        = 0xD002,	/* not present in x86 ISA */
	insn_iret         = 0xD003,
	insn_bound        = 0xD004,
	insn_debug        = 0xD005,
	insn_trace        = 0xD006,
	insn_invalid_op   = 0xD007,
	insn_oflow        = 0xD008,

	/* ---- insn_system ---- */
	insn_halt         = 0xE001,
	insn_in           = 0xE002,	/* input from port/bus */
	insn_out          = 0xE003,	/* output to port/bus */
	insn_cpuid        = 0xE004,

	/* ---- insn_other ---- */
	insn_nop          = 0xF001,
	insn_bcdconv      = 0xF002,	/* convert to or from BCD */
	insn_szconv       = 0xF003	/* change size of operand */
};
403
/* These flags specify special characteristics of the instruction, such as
 * whether the instruction is privileged or whether it serializes the
 * pipeline.  Each note is a single bit.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed.
 * The last enumerator carries no trailing comma so that the header is
 * also accepted by strict C89 and C++98 compilers. */
enum x86_insn_note {
	insn_note_ring0    = 1,		/* Only available in ring 0 */
	insn_note_smm      = 2,		/* "" in System Management Mode */
	insn_note_serial   = 4,		/* Serializing instruction */
	insn_note_nonswap  = 8,		/* Does not swap arguments in att-style formatting */
	insn_note_nosuffix = 16		/* Does not have size suffix in att-style formatting */
};
416
/* x86_flag_status : the effects an instruction has on the %eflags
 * register.  Every status occupies its own bit. */
enum x86_flag_status {
	insn_carry_set                 = 1 << 0,	/* CF */
	insn_zero_set                  = 1 << 1,	/* ZF */
	insn_oflow_set                 = 1 << 2,	/* OF */
	insn_dir_set                   = 1 << 3,	/* DF */
	insn_sign_set                  = 1 << 4,	/* SF */
	insn_parity_set                = 1 << 5,	/* PF */
	insn_carry_or_zero_set         = 1 << 6,
	insn_zero_set_or_sign_ne_oflow = 1 << 7,
	insn_carry_clear               = 1 << 8,
	insn_zero_clear                = 1 << 9,
	insn_oflow_clear               = 1 << 10,
	insn_dir_clear                 = 1 << 11,
	insn_sign_clear                = 1 << 12,
	insn_parity_clear              = 1 << 13,
	insn_sign_eq_oflow             = 1 << 14,
	insn_sign_ne_oflow             = 1 << 15
};
436
/* x86_insn_cpu : the CPU model in which the instruction first appeared.
 * This can be used to mask out instructions appearing in earlier or later
 * models, or to check the portability of a binary.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_cpu {
	/* Intel */
	cpu_8086       = 1,
	cpu_80286      = 2,
	cpu_80386      = 3,
	cpu_80387      = 4,
	cpu_80486      = 5,
	cpu_pentium    = 6,
	cpu_pentiumpro = 7,
	cpu_pentium2   = 8,
	cpu_pentium3   = 9,
	cpu_pentium4   = 10,
	/* AMD */
	cpu_k6         = 0x10,
	cpu_k7         = 0x20,
	cpu_athlon     = 0x30
};
457
/* x86_insn_isa : CPU ISA subsets.  These are derived from the Instruction
 * Groups in Intel Vol 1 Chapter 5; they represent subsets of the IA32 ISA
 * but do not reflect the 'type' of the instruction in the same way that
 * x86_insn_group does.  In short, these are AMD/Intel's somewhat useless
 * designations.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_isa {
	isa_gp     = 0x1,	/* general purpose */
	isa_fp     = 0x2,	/* floating point */
	isa_fpumgt = 0x3,	/* FPU/SIMD management */
	isa_mmx    = 0x4,	/* Intel MMX */
	isa_sse1   = 0x5,	/* Intel SSE SIMD */
	isa_sse2   = 0x6,	/* Intel SSE2 SIMD */
	isa_sse3   = 0x7,	/* Intel SSE3 SIMD */
	isa_3dnow  = 0x8,	/* AMD 3DNow! SIMD */
	isa_sys    = 0x9	/* system instructions */
};
476
/* x86_insn_prefix : instruction prefixes.  Each prefix is a single bit;
 * x86_insn_t.prefix holds them ORed together. */
enum x86_insn_prefix {
	insn_no_prefix   = 0x0,
	insn_rep_zero    = 0x1,	/* REPZ and REPE */
	insn_rep_notzero = 0x2,	/* REPNZ and REPNE */
	insn_lock        = 0x4	/* LOCK: */
};
483
/* TODO: maybe provide insn_new/free(), and have disasm return new insn_t */
/* x86_insn_t : an X86 instruction.
 * This is the structure filled in by x86_disasm().  It owns a linked list
 * of operands ('operands'), which is released with x86_oplist_free(). */
typedef struct {
	/* information about the instruction */
	uint32_t addr;		/* load address */
	uint32_t offset;	/* offset into file/buffer */
	enum x86_insn_group group;	/* meta-type, e.g. insn_controlflow */
	enum x86_insn_type type;	/* type, e.g. insn_jmp */
	enum x86_insn_note note;	/* note, e.g. insn_note_ring0 */
	unsigned char bytes[MAX_INSN_SIZE];	/* bytes of the insn */
	unsigned char size;	/* size of insn in bytes */
	/* 16/32-bit mode settings */
	unsigned char addr_size;	/* default address size : 2 or 4 */
	unsigned char op_size;	/* default operand size : 2 or 4 */
	/* CPU/instruction set */
	enum x86_insn_cpu cpu;
	enum x86_insn_isa isa;
	/* flags: values of enum x86_flag_status */
	enum x86_flag_status flags_set;	/* flags set by insn */
	enum x86_flag_status flags_tested;	/* flags tested by insn */
	/* stack */
	unsigned char stack_mod;	/* 0 or 1 : is the stack modified? */
	int32_t stack_mod_val;	/* val stack is modified by if known */

	/* the instruction proper */
	enum x86_insn_prefix prefix;	/* prefixes ORed together */
	char prefix_string[MAX_PREFIX_STR];	/* prefixes [might be truncated] */
	char mnemonic[MAX_MNEM_STR];
	x86_oplist_t *operands;	/* list of explicit/implicit operands */
	size_t operand_count;	/* total number of operands */
	size_t explicit_count;	/* number of explicit operands */
	/* convenience fields for user; see x86_set_insn_block(),
	 * x86_set_insn_function() and x86_tag_insn() */
	void *block;		/* code block containing this insn */
	void *function;		/* function containing this insn */
	int tag;		/* tag the insn as seen/processed */
} x86_insn_t;
520
521
/* x86_insn_is_valid : check whether 'insn' holds a valid instruction.
 * Returns 0 if the instruction is invalid, 1 if valid */
int x86_insn_is_valid( x86_insn_t *insn );
524
525/* DISASSEMBLY ROUTINES
526 * Canonical order of arguments is
527 * (buf, buf_len, buf_rva, offset, len, insn, func, arg, resolve_func)
528 * ...but of course all of these are not used at the same time.
529 */
530
531
/* DISASM_CALLBACK : function prototype for caller-supplied callback routine.
 * These callbacks are intended to process 'insn' further, e.g. by
 * adding it to a linked list, database, etc.
 *    insn : the instruction to process
 *    arg  : the arbitrary 'arg' data given to the disassembly routine
 *           together with the callback */
typedef void (*DISASM_CALLBACK)( x86_insn_t *insn, void * arg );
536
/* DISASM_RESOLVER : function prototype for caller-supplied address resolver.
 * This routine is used to determine the rva to disassemble next, given
 * the 'dest' operand of a jump/call. This allows the caller to resolve
 * jump/call targets stored in a register or on the stack, and also allows
 * the caller to prevent endless loops by checking if an address has
 * already been disassembled. If an address cannot be resolved from the
 * operand, or if the address has already been disassembled, this routine
 * should return -1; in all other cases the RVA to be disassembled next
 * should be returned.
 *    op           : the 'dest' operand of the jump/call
 *    current_insn : the instruction being processed
 *    arg          : arbitrary caller data (the 'r_arg' handed to
 *                   x86_disasm_forward) */
typedef int32_t (*DISASM_RESOLVER)( x86_op_t *op, x86_insn_t * current_insn,
				    void *arg );
548
549
/* x86_disasm: Disassemble a single instruction from a buffer of bytes.
 *             Returns size of instruction in bytes.
 *             Caller is responsible for calling x86_oplist_free() on
 *             a reused "insn" to avoid leaking memory when calling this
 *             function repeatedly.
 *             NOTE(review): the value returned when the instruction
 *             cannot be disassembled is not stated in this header --
 *             confirm against the implementation.
 * buf     : Buffer of bytes to disassemble
 * buf_len : Length of the buffer
 * buf_rva : Load address of the start of the buffer
 * offset  : Offset in buffer to disassemble
 * insn    : Structure to fill with disassembled instruction
 */
unsigned int x86_disasm( unsigned char *buf, unsigned int buf_len,
			 uint32_t buf_rva, unsigned int offset,
			 x86_insn_t * insn );
564
/* x86_disasm_range: Sequential disassembly of a range of bytes in a buffer,
 *                   invoking a callback function each time an instruction
 *                   is successfully disassembled. The 'range' refers to the
 *                   bytes between 'offset' and 'offset + len' in the buffer;
 *                   'len' is assumed to be less than the length of the buffer
 *                   (no buffer length is passed, so the caller must ensure
 *                   that the range lies inside 'buf').
 *                   Returns number of instructions processed.
 * buf     : Buffer of bytes to disassemble (e.g. .text section)
 * buf_rva : Load address of buffer (e.g. ELF Virtual Address)
 * offset  : Offset in buffer to start disassembly at
 * len     : Number of bytes to disassemble
 * func    : Callback function to invoke (may be NULL)
 * arg     : Arbitrary data to pass to callback (may be NULL)
 */
unsigned int x86_disasm_range( unsigned char *buf, uint32_t buf_rva,
			       unsigned int offset, unsigned int len,
			       DISASM_CALLBACK func, void *arg );
581
/* x86_disasm_forward: Flow-of-execution disassembly of the bytes in a buffer,
 *                     invoking a callback function each time an instruction
 *                     is successfully disassembled.
 *                     NOTE(review): the return value is not documented
 *                     here; presumably the number of instructions
 *                     processed, as for x86_disasm_range -- confirm.
 * buf     : Buffer to disassemble (e.g. .text section)
 * buf_len : Number of bytes in buffer
 * buf_rva : Load address of buffer (e.g. ELF Virtual Address)
 * offset  : Offset in buffer to start disassembly at (e.g. entry point)
 * func    : Callback function to invoke (may be NULL)
 * arg     : Arbitrary data to pass to callback (may be NULL)
 * resolver: Caller-supplied address resolver. If no resolver is
 *           supplied, a default internal one is used -- however the
 *           internal resolver does NOT catch loops and could end up
 *           disassembling forever.
 * r_arg   : Arbitrary data to pass to resolver (may be NULL)
 */
unsigned int x86_disasm_forward( unsigned char *buf, unsigned int buf_len,
				 uint32_t buf_rva, unsigned int offset,
				 DISASM_CALLBACK func, void *arg,
				 DISASM_RESOLVER resolver, void *r_arg );
601
/* Instruction operands: these are stored as a list of explicit and
 * implicit operands. It is recommended that the 'foreach' routines
 * be used when examining operands for purposes of data flow analysis */

/* Operand FOREACH callback: 'arg' is an arbitrary parameter passed to the
 * foreach routine, 'insn' is the x86_insn_t whose operands are being
 * iterated over, and 'op' is the current x86_op_t */
typedef void (*x86_operand_fn)(x86_op_t *op, x86_insn_t *insn, void *arg);
610
/* FOREACH types: selectors that restrict the results of a foreach to
 *                operands which match a certain "type" (implicit or
 *                explicit) or which are accessed in certain ways (e.g.
 *                read or write). Note that this operates on the operand
 *                list of a single instruction, so specifying the 'real'
 *                operand type (register, memory, etc) is not useful. Note
 *                also that by definition Execute Access implies Read
 *                Access and implies Not Write Access.
 *                The "type" (implicit or explicit) and the access method
 *                can be ORed together, e.g. op_wo | op_explicit */
enum x86_op_foreach_type {
	/* access method: low bits */
	op_any      = 0x00,	/* ALL operands (explicit, implicit, rwx) */
	op_dest     = 0x01,	/* operands with Write access */
	op_src      = 0x02,	/* operands with Read access */
	op_ro       = 0x03,	/* operands with Read but not Write access */
	op_wo       = 0x04,	/* operands with Write but not Read access */
	op_xo       = 0x05,	/* operands with Execute access */
	op_rw       = 0x06,	/* operands with Read AND Write access */
	/* "type": high bits */
	op_implicit = 0x10,	/* operands that are implied by the opcode */
	op_explicit = 0x20	/* operands that are not side-effects */
};
631
632
/* x86_oplist_free : free the operand list associated with an instruction --
 * useful for preventing memory leaks when free()ing an x86_insn_t, or when
 * reusing one for repeated x86_disasm() calls */
void x86_oplist_free( x86_insn_t *insn );
636
/* Operand foreach: invokes 'func' on the operands of 'insn', passing it
 * 'insn' and 'arg' as arguments. The 'type' parameter is used to select
 * only operands matching specific criteria (see enum x86_op_foreach_type).
 * NOTE(review): the meaning of the int return value is not documented in
 * this header -- confirm against the implementation. */
int x86_operand_foreach( x86_insn_t *insn, x86_operand_fn func, void *arg,
			 enum x86_op_foreach_type type);
642
/* convenience routine: returns count of the operands of 'insn' that match
 * 'type' (see enum x86_op_foreach_type) */
size_t x86_operand_count( x86_insn_t *insn, enum x86_op_foreach_type type );
645
/* accessor functions for the operands: return the first, second and third
 * operand of 'insn'.
 * NOTE(review): presumably these return NULL when the instruction has no
 * such operand -- confirm against the implementation. */
x86_op_t * x86_operand_1st( x86_insn_t *insn );
x86_op_t * x86_operand_2nd( x86_insn_t *insn );
x86_op_t * x86_operand_3rd( x86_insn_t *insn );
650
/* libdisasm 2.0 compatibility: the old dest/src/imm accessor names are
 * kept as aliases for the positional (1st/2nd/3rd) operand accessors */
#define x86_get_dest_operand( i )	x86_operand_1st( i )
#define x86_get_src_operand( i )	x86_operand_2nd( i )
#define x86_get_imm_operand( i )	x86_operand_3rd( i )
655
/* x86_operand_size : get size of the data of operand 'op' in bytes */
unsigned int x86_operand_size( x86_op_t *op );
658
/* Operand Convenience Routines: the following routines are common
 * operations on operands, intended to ease the burden of the programmer. */
661
/* Get Address: return the value of an offset operand, or the offset of
 * a segment:offset absolute address.
 * NOTE(review): the value returned when 'insn' has no such operand is not
 * documented here -- confirm against the implementation. */
uint32_t x86_get_address( x86_insn_t *insn );
665
/* Get Relative Offset: return as a sign-extended int32_t the near or far
 * relative offset operand, or 0 if there is none. There can be only one
 * relative offset operand in an instruction. */
int32_t x86_get_rel_offset( x86_insn_t *insn );
670
/* Get Branch Target: return the x86_op_t containing the target of
 * a jump or call operand, or NULL if there is no branch target.
 * Internally, a 'branch target' is defined as any operand with
 * Execute Access (op_execute) set. There can be only one branch target
 * per instruction. */
x86_op_t * x86_get_branch_target( x86_insn_t *insn );
676
/* Get Immediate: return the x86_op_t containing the immediate operand
 * for this instruction, or NULL if there is no immediate operand. There
 * can be only one immediate operand per instruction. */
x86_op_t * x86_get_imm( x86_insn_t *insn );
681
/* Get Raw Immediate Data: returns a pointer to the immediate data encoded
 * in the instruction. This is useful for large data types [>32 bits] currently
 * not supported by libdisasm, or for determining if the disassembler
 * screwed up the conversion of the immediate data. Note that 'imm' in this
 * context refers to immediate data encoded at the end of an instruction as
 * detailed in the Intel Manual Vol II Chapter 2; it does not refer to the
 * 'op_imm' operand (the third operand in instructions like 'mul').
 * NOTE(review): the value returned when the instruction carries no
 * immediate data is not documented here -- confirm. */
unsigned char * x86_get_raw_imm( x86_insn_t *insn );
690
691
/* More accessor functions, this time for user-defined info... */
/* set the address (usually RVA) of the insn */
void x86_set_insn_addr( x86_insn_t *insn, uint32_t addr );

/* set the offset (usually offset into file) of the insn */
void x86_set_insn_offset( x86_insn_t *insn, unsigned int offset );

/* set a pointer to the function owning the instruction. The
 * type of 'func' is user-defined; libdisasm does not use the func field. */
void x86_set_insn_function( x86_insn_t *insn, void * func );

/* set a pointer to the block of code owning the instruction. The
 * type of 'block' is user-defined; libdisasm does not use the block field. */
void x86_set_insn_block( x86_insn_t *insn, void * block );
706
/* instruction tagging: these routines allow the programmer to mark
 * instructions as "seen" in a DFS, for example. libdisasm does not use
 * the tag field itself; it belongs entirely to the caller. */
/* set insn->tag to 1 */
void x86_tag_insn( x86_insn_t *insn );
/* set insn->tag to 0 */
void x86_untag_insn( x86_insn_t *insn );
/* return insn->tag */
int x86_insn_is_tagged( x86_insn_t *insn );
716
717
/* Disassembly formats:
 *      AT&T   is standard AS/GAS-style: "mnemonic\tsrc, dest, imm"
 *      Intel  is standard MASM/NASM/TASM: "mnemonic\tdest,src, imm"
 *      Native is tab-delimited: "RVA\tbytes\tmnemonic\tdest\tsrc\timm"
 *      XML    is your typical <insn> ... </insn>
 *      Raw    is addr|offset|size|bytes|prefix... see libdisasm_formats.7
 */
enum x86_asm_format {
	unknown_syntax = 0,	/* never use! */
	native_syntax  = 1,	/* header: 35 bytes */
	intel_syntax   = 2,	/* header: 23 bytes */
	att_syntax     = 3,	/* header: 23 bytes */
	xml_syntax     = 4,	/* header: 679 bytes */
	raw_syntax     = 5	/* header: 172 bytes */
};
733
/* format (sprintf) an operand into 'buf' using specified syntax.
 *    op     : operand to format
 *    buf    : destination buffer
 *    len    : size of 'buf'
 *    format : syntax to use
 * NOTE(review): the meaning of the int return value is not documented in
 * this header -- confirm against the implementation. */
int x86_format_operand(x86_op_t *op, char *buf, int len,
		       enum x86_asm_format format);
737
/* format (sprintf) an instruction mnemonic into 'buf' using specified syntax.
 *    insn   : instruction whose mnemonic is formatted
 *    buf    : destination buffer
 *    len    : size of 'buf'
 *    format : syntax to use
 * NOTE(review): the meaning of the int return value is not documented in
 * this header -- confirm against the implementation. */
int x86_format_mnemonic(x86_insn_t *insn, char *buf, int len,
			enum x86_asm_format format);
741
742/* format (sprintf) an instruction into 'buf' using specified syntax;
743 * this includes formatting all operands */
744int x86_format_insn(x86_insn_t *insn, char *buf, int len, enum x86_asm_format);
745
/* fill 'buf' with a description of the format's syntax.
 *    buf    : destination buffer
 *    len    : size of 'buf'
 *    format : syntax to describe; the per-format header sizes are listed
 *             in the comments of enum x86_asm_format
 * NOTE(review): the meaning of the int return value is not documented in
 * this header -- confirm against the implementation. */
int x86_format_header( char *buf, int len, enum x86_asm_format format);
748
/* Endianness of an x86 CPU : 0 is big, 1 is little; always returns 1 */
unsigned int x86_endian(void);

/* Default address and operand size in bytes */
unsigned int x86_addr_size(void);
unsigned int x86_op_size(void);

/* Size of a machine word in bytes */
unsigned int x86_word_size(void);
758
/* maximum size of a code instruction */
unsigned int x86_max_insn_size(void);
/* alternate spelling ('inst' for 'insn') that forwards to the routine above */
#define x86_max_inst_size( unused )	x86_max_insn_size( unused )
762
/* register IDs of Stack, Frame, Instruction pointer and Flags register;
 * an ID can be turned into a full x86_reg_t with x86_reg_from_id() */
unsigned int x86_sp_reg(void);
unsigned int x86_fp_reg(void);
unsigned int x86_ip_reg(void);
unsigned int x86_flag_reg(void);
768
/* x86_reg_from_id : fill 'reg' struct with details of register 'id'.
 *    id  : register ID # (compare x86_reg_t.id)
 *    reg : caller-provided structure that receives the details */
void x86_reg_from_id( unsigned int id, x86_reg_t * reg );
771
/* convenience macro demonstrating how to get an aliased register; proto is
 *   void x86_get_aliased_reg( x86_reg_t *alias_reg, x86_reg_t *output_reg )
 * where 'alias_reg' is a reg operand and 'output_reg' is filled with the
 * register that the operand is an alias for.
 * Both arguments are parenthesized so that pointer expressions such as
 * 'regs + 1' are dereferenced as a whole. */
#define x86_get_aliased_reg( alias_reg, output_reg ) \
	x86_reg_from_id( (alias_reg)->alias, (output_reg) )
778
779
780/* ================================== Invariant Instruction Representation */
781/* Invariant instructions are used for generating binary signatures;
782 * the instruction is modified so that all variant bytes in an instruction
783 * are replaced with a wildcard byte.
784 *
785 * A 'variant byte' is one that is expected to be modified by either the
786 * static or the dynamic linker: for example, an address encoded in an
787 * instruction.
788 *
789 * By comparing the invariant representation of one instruction [or of a
790 * sequence of instructions] with the invariant representation of another,
 * one can determine whether the two invariant representations are from the same
792 * relocatable object [.o] file. Thus one can use binary signatures [which
793 * are just sequences of invariant instruction representations] to look for
794 * library routines which have been statically-linked into a binary.
795 *
796 * The invariant routines are faster and smaller than the disassembly
797 * routines; they can be used to determine the size of an instruction
798 * without all of the overhead of a full instruction disassembly.
799 */
800
/* This byte is used to replace variant bytes in the invariant
 * representation of an instruction */
#define X86_WILDCARD_BYTE 0xF4
803
/* x86_invariant_op_t : an operand of an invariant instruction.  It carries
 * the same four descriptive fields as x86_op_t, but no operand data. */
typedef struct {
	enum x86_op_type type;		/* operand type */
	enum x86_op_datatype datatype;	/* operand size */
	enum x86_op_access access;	/* operand access [RWX] */
	enum x86_op_flags flags;	/* misc flags */
} x86_invariant_op_t;
810
/* x86_invariant_t : the invariant representation of an instruction, i.e.
 * the instruction with its variant bytes replaced by X86_WILDCARD_BYTE */
typedef struct {
	unsigned char bytes[64];	/* invariant representation */
	unsigned int size;		/* number of bytes in insn */
	enum x86_insn_group group;	/* meta-type, e.g. insn_controlflow */
	enum x86_insn_type type;	/* type, e.g. insn_jmp */
	x86_invariant_op_t operands[3];	/* operands: dest, src, imm */
} x86_invariant_t;
818
819
/* x86_invariant_disasm : produce a version of the instruction with the
 * variant bytes masked out.
 *    buf     : bytes of the instruction
 *    buf_len : length of 'buf'
 *    inv     : invariant structure; presumably this receives the masked
 *              instruction -- confirm
 * NOTE(review): the size_t return value is presumably the size of the
 * instruction in bytes (x86_size_disasm is described as being built on
 * this routine) -- confirm against the implementation. */
size_t x86_invariant_disasm( unsigned char *buf, int buf_len,
			     x86_invariant_t *inv );
/* return the size in bytes of the instruction pointed to by 'buf';
 * this uses x86_invariant_disasm since it is faster than x86_disasm.
 *    buf     : bytes of the instruction
 *    buf_len : length of 'buf' */
size_t x86_size_disasm( unsigned char *buf, unsigned int buf_len );
826
827#ifdef __cplusplus
828}
829#endif
830
831
832#endif
833