1 | #ifndef LIBDISASM_H |
2 | #define LIBDISASM_H |
3 | |
4 | #include <stdint.h> |
5 | |
6 | /* "NEW" types |
7 | * __________________________________________________________________________*/ |
8 | #ifndef LIBDISASM_QWORD_H /* do not interfere with qword.h */ |
9 | #define LIBDISASM_QWORD_H |
10 | #ifdef _MSC_VER |
11 | typedef __int64 qword_t; |
12 | #else |
13 | typedef int64_t qword_t; |
14 | #endif |
15 | #endif |
16 | |
17 | #include <sys/types.h> |
18 | |
19 | #ifdef __cplusplus |
20 | extern "C" { |
21 | #endif |
22 | |
23 | /* "NEW" x86 API |
24 | * __________________________________________________________________________*/ |
25 | |
26 | |
27 | /* ========================================= Error Reporting */ |
28 | /* REPORT CODES |
29 | * These are passed to the reporter function supplied at initialization. |
30 | * Each code determines the type of the argument passed to the reporter; |
31 | * this allows the reporter to recover from errors, or just log them. |
32 | */ |
33 | enum x86_report_codes { |
34 | report_disasm_bounds, /* RVA OUT OF BOUNDS : The disassembler could |
35 | not disassemble the supplied RVA as it is |
36 | out of the range of the buffer. The |
37 | application should store the address and |
38 | attempt to determine what section of the |
39 | binary it is in, then disassemble the |
40 | address from the bytes in that section. |
41 | data: uint32_t rva */ |
42 | report_insn_bounds, /* INSTRUCTION OUT OF BOUNDS: The disassembler |
43 | could not disassemble the instruction as |
44 | the instruction would require bytes beyond |
45 | the end of the current buffer. This usually |
46 | indicates garbage bytes at the end of a |
47 | buffer, or an incorrectly-sized buffer. |
48 | data: uint32_t rva */ |
49 | report_invalid_insn, /* INVALID INSTRUCTION: The disassembler could |
50 | not disassemble the instruction as it has an |
51 | invalid combination of opcodes and operands. |
52 | This will stop automated disassembly; the |
53 | application can restart the disassembly |
54 | after the invalid instruction. |
55 | data: uint32_t rva */ |
56 | report_unknown |
57 | }; |
58 | |
59 | /* 'arg' is optional arbitrary data provided by the code passing the |
60 | * callback -- for example, it could be 'this' or 'self' in OOP code. |
61 | * 'code' is provided by libdisasm, it is one of the above |
62 | * 'data' is provided by libdisasm and is context-specific, per the enums */ |
63 | typedef void (*DISASM_REPORTER)( enum x86_report_codes code, |
64 | void *data, void *arg ); |
65 | |
66 | |
67 | /* x86_report_error : Call the registered reporter to report an error */ |
68 | void x86_report_error( enum x86_report_codes code, void *data ); |
69 | |
70 | /* ========================================= Libdisasm Management Routines */ |
/* Disassembler option flags, as accepted by x86_init() and
 * x86_set_options() and returned by x86_get_options().
 * No trailing comma after the last enumerator: this header is also consumed
 * by C++ (see the extern "C" block), and pre-C++11 / C89 pedantic builds
 * reject a comma at the end of an enumerator list. */
enum x86_options {          /* these can be ORed together */
    opt_none          = 0,
    opt_ignore_nulls  = 1,  /* ignore sequences of > 4 NULL bytes */
    opt_16_bit        = 2,  /* 16-bit/DOS disassembly */
    opt_att_mnemonics = 4   /* use AT&T syntax names for alternate opcode mnemonics */
};
77 | |
78 | /* management routines */ |
79 | /* 'arg' is caller-specific data which is passed as the last argument |
80 | * to the reporter callback routine (see DISASM_REPORTER) */ |
81 | int x86_init( enum x86_options options, DISASM_REPORTER reporter, void *arg); |
82 | void x86_set_reporter( DISASM_REPORTER reporter, void *arg); |
83 | void x86_set_options( enum x86_options options ); |
84 | enum x86_options x86_get_options( void ); |
85 | int x86_cleanup(void); |
86 | |
87 | |
88 | /* ========================================= Instruction Representation */ |
/* these defines are only intended for use in the array decl's */
#define MAX_REGNAME 8

#define MAX_PREFIX_STR 32
#define MAX_MNEM_STR 16
#define MAX_INSN_SIZE 20        /* same as in i386.h */
#define MAX_OP_STRING 32        /* max possible operand size in string form */
#define MAX_OP_RAW_STRING 64    /* max possible operand size in raw form */
#define MAX_OP_XML_STRING 256   /* max possible operand size in xml form */
#define MAX_NUM_OPERANDS 8      /* max # implicit and explicit operands */
/* Whole-instruction buffer sizes are derived from the per-operand limits
 * above so the two can never drift apart. The '2 *' is arbitrary: the max #
 * of operands should require more space than the rest of the insn.
 * Each expansion is fully parenthesized and is still an integer constant
 * expression, so it remains usable in array declarations and in #if. */
#define MAX_INSN_STRING     (2 * MAX_NUM_OPERANDS * MAX_OP_STRING)     /* 512 */
#define MAX_INSN_RAW_STRING (2 * MAX_NUM_OPERANDS * MAX_OP_RAW_STRING) /* 1024 */
#define MAX_INSN_XML_STRING (2 * MAX_NUM_OPERANDS * MAX_OP_XML_STRING) /* 4096 */
104 | |
105 | enum x86_reg_type { /* NOTE: these may be ORed together */ |
106 | reg_gen = 0x00001, /* general purpose */ |
107 | reg_in = 0x00002, /* incoming args, ala RISC */ |
108 | reg_out = 0x00004, /* args to calls, ala RISC */ |
109 | reg_local = 0x00008, /* local vars, ala RISC */ |
110 | reg_fpu = 0x00010, /* FPU data register */ |
111 | reg_seg = 0x00020, /* segment register */ |
112 | reg_simd = 0x00040, /* SIMD/MMX reg */ |
113 | reg_sys = 0x00080, /* restricted/system register */ |
114 | reg_sp = 0x00100, /* stack pointer */ |
115 | reg_fp = 0x00200, /* frame pointer */ |
116 | reg_pc = 0x00400, /* program counter */ |
117 | reg_retaddr = 0x00800, /* return addr for func */ |
118 | reg_cond = 0x01000, /* condition code / flags */ |
119 | reg_zero = 0x02000, /* zero register, ala RISC */ |
120 | reg_ret = 0x04000, /* return value */ |
121 | reg_src = 0x10000, /* array/rep source */ |
122 | reg_dest = 0x20000, /* array/rep destination */ |
123 | reg_count = 0x40000 /* array/rep/loop counter */ |
124 | }; |
125 | |
126 | /* x86_reg_t : an X86 CPU register */ |
127 | typedef struct { |
128 | char name[MAX_REGNAME]; |
129 | enum x86_reg_type type; /* what register is used for */ |
130 | unsigned int size; /* size of register in bytes */ |
131 | unsigned int id; /* register ID #, for quick compares */ |
132 | unsigned int alias; /* ID of reg this is an alias for */ |
133 | unsigned int shift; /* amount to shift aliased reg by */ |
134 | } x86_reg_t; |
135 | |
136 | /* x86_ea_t : an X86 effective address (address expression) */ |
137 | typedef struct { |
138 | unsigned int scale; /* scale factor */ |
139 | x86_reg_t index, base; /* index, base registers */ |
140 | int32_t disp; /* displacement */ |
141 | char disp_sign; /* is negative? 1/0 */ |
142 | char disp_size; /* 0, 1, 2, 4 */ |
143 | } x86_ea_t; |
144 | |
145 | /* x86_absolute_t : an X86 segment:offset address (descriptor) */ |
146 | typedef struct { |
147 | unsigned short segment; /* loaded directly into CS */ |
148 | union { |
149 | unsigned short off16; /* loaded directly into IP */ |
150 | uint32_t off32; /* loaded directly into EIP */ |
151 | } offset; |
152 | } x86_absolute_t; |
153 | |
/* x86_op_type : the kind of value an operand holds. The numeric order is
 * significant: x86_optype_is_memory() relies on every memory-referencing
 * type lying strictly between op_immediate and op_unknown. */
enum x86_op_type {          /* mutually exclusive */
    op_unused = 0,          /* empty/unused operand: should never occur */
    op_register = 1,        /* CPU register */
    op_immediate = 2,       /* Immediate Value */
    op_relative_near = 3,   /* Relative offset from IP */
    op_relative_far = 4,    /* Relative offset from IP */
    op_absolute = 5,        /* Absolute address (ptr16:32) */
    op_expression = 6,      /* Address expression (scale/index/base/disp) */
    op_offset = 7,          /* Offset from start of segment (m32) */
    op_unknown
};

/* Operand-type predicates; each yields non-zero when 'optype' matches.
 *   x86_optype_is_address  : op_absolute or op_offset
 *   x86_optype_is_relative : op_relative_near or op_relative_far
 *   x86_optype_is_memory   : anything strictly between op_immediate and
 *                            op_unknown (relative, absolute, expression,
 *                            offset)
 * The parameter is parenthesized at every use so that compound arguments
 * (e.g. a conditional expression) are compared as a whole. The argument is
 * still evaluated more than once, so it must not have side effects. */
#define x86_optype_is_address( optype ) \
    ( (optype) == op_absolute || (optype) == op_offset )
#define x86_optype_is_relative( optype ) \
    ( (optype) == op_relative_near || (optype) == op_relative_far )
#define x86_optype_is_memory( optype ) \
    ( (optype) > op_immediate && (optype) < op_unknown )
172 | |
/* x86_op_datatype : the data type (and therefore size) of an operand.
 * No trailing comma after the last enumerator, for consistency with the
 * other enums in this header and so that pedantic C89 / pre-C++11 C++
 * consumers can include it. */
enum x86_op_datatype {      /* these use Intel's lame terminology */
    op_byte = 1,            /* 1 byte integer */
    op_word = 2,            /* 2 byte integer */
    op_dword = 3,           /* 4 byte integer */
    op_qword = 4,           /* 8 byte integer */
    op_dqword = 5,          /* 16 byte integer */
    op_sreal = 6,           /* 4 byte real (single real) */
    op_dreal = 7,           /* 8 byte real (double real) */
    op_extreal = 8,         /* 10 byte real (extended real) */
    op_bcd = 9,             /* 10 byte binary-coded decimal */
    op_ssimd = 10,          /* 16 byte : 4 packed single FP (SIMD, MMX) */
    op_dsimd = 11,          /* 16 byte : 2 packed double FP (SIMD, MMX) */
    op_sssimd = 12,         /* 4 byte : scalar single FP (SIMD, MMX) */
    op_sdsimd = 13,         /* 8 byte : scalar double FP (SIMD, MMX) */
    op_descr32 = 14,        /* 6 byte Intel descriptor 2:4 */
    op_descr16 = 15,        /* 4 byte Intel descriptor 2:2 */
    op_pdescr32 = 16,       /* 6 byte Intel pseudo-descriptor 32:16 */
    op_pdescr16 = 17,       /* 6 byte Intel pseudo-descriptor 8:24:16 */
    op_bounds16 = 18,       /* signed 16:16 lower:upper bounds */
    op_bounds32 = 19,       /* signed 32:32 lower:upper bounds */
    op_fpuenv16 = 20,       /* 14 byte FPU control/environment data */
    op_fpuenv32 = 21,       /* 28 byte FPU control/environment data */
    op_fpustate16 = 22,     /* 94 byte FPU state (env & reg stack) */
    op_fpustate32 = 23,     /* 108 byte FPU state (env & reg stack) */
    op_fpregset = 24,       /* 512 bytes: register set */
    op_fpreg = 25,          /* FPU register */
    op_none = 0xFF          /* operand without a datatype (INVLPG) */
};
201 | |
202 | enum x86_op_access { /* ORed together */ |
203 | op_read = 1, |
204 | op_write = 2, |
205 | op_execute = 4 |
206 | }; |
207 | |
208 | enum x86_op_flags { /* ORed together, but segs are mutually exclusive */ |
209 | op_signed = 1, /* signed integer */ |
210 | op_string = 2, /* possible string or array */ |
211 | op_constant = 4, /* symbolic constant */ |
212 | op_pointer = 8, /* operand points to a memory address */ |
213 | op_sysref = 0x010, /* operand is a syscall number */ |
214 | op_implied = 0x020, /* operand is implicit in the insn */ |
215 | op_hardcode = 0x40, /* operand is hardcoded in insn definition */ |
216 | /* NOTE: an 'implied' operand is one which can be considered a side |
217 | * effect of the insn, e.g. %esp being modified by PUSH or POP. A |
218 | * 'hard-coded' operand is one which is specified in the instruction |
219 | * definition, e.g. %es:%edi in MOVSB or 1 in ROL Eb, 1. The difference |
220 | * is that hard-coded operands are printed by disassemblers and are |
221 | * required to re-assemble, while implicit operands are invisible. */ |
222 | op_es_seg = 0x100, /* ES segment override */ |
223 | op_cs_seg = 0x200, /* CS segment override */ |
224 | op_ss_seg = 0x300, /* SS segment override */ |
225 | op_ds_seg = 0x400, /* DS segment override */ |
226 | op_fs_seg = 0x500, /* FS segment override */ |
227 | op_gs_seg = 0x600 /* GS segment override */ |
228 | }; |
229 | |
230 | /* x86_op_t : an X86 instruction operand */ |
231 | typedef struct { |
232 | enum x86_op_type type; /* operand type */ |
233 | enum x86_op_datatype datatype; /* operand size */ |
234 | enum x86_op_access access; /* operand access [RWX] */ |
235 | enum x86_op_flags flags; /* misc flags */ |
236 | union { |
237 | /* sizeof will have to work on these union members! */ |
238 | /* immediate values */ |
239 | char sbyte; |
240 | short sword; |
241 | int32_t sdword; |
242 | qword_t sqword; |
243 | unsigned char byte; |
244 | unsigned short word; |
245 | uint32_t dword; |
246 | qword_t qword; |
247 | float sreal; |
248 | double dreal; |
249 | /* misc large/non-native types */ |
250 | unsigned char extreal[10]; |
251 | unsigned char bcd[10]; |
252 | qword_t dqword[2]; |
253 | unsigned char simd[16]; |
254 | unsigned char fpuenv[28]; |
255 | /* offset from segment */ |
256 | uint32_t offset; |
257 | /* ID of CPU register */ |
258 | x86_reg_t reg; |
259 | /* offsets from current insn */ |
260 | char relative_near; |
261 | int32_t relative_far; |
262 | /* segment:offset */ |
263 | x86_absolute_t absolute; |
264 | /* effective address [expression] */ |
265 | x86_ea_t expression; |
266 | } data; |
267 | /* this is needed to make formatting operands more sane */ |
268 | void * insn; /* pointer to x86_insn_t owning operand */ |
269 | } x86_op_t; |
270 | |
271 | /* Linked list of x86_op_t; provided for manual traversal of the operand |
272 | * list in an insn. Users wishing to add operands to this list, e.g. to add |
273 | * implicit operands, should use x86_operand_new in x86_operand_list.h */ |
274 | typedef struct x86_operand_list { |
275 | x86_op_t op; |
276 | struct x86_operand_list *next; |
277 | } x86_oplist_t; |
278 | |
279 | enum x86_insn_group { |
280 | insn_none = 0, /* invalid instruction */ |
281 | insn_controlflow = 1, |
282 | insn_arithmetic = 2, |
283 | insn_logic = 3, |
284 | insn_stack = 4, |
285 | insn_comparison = 5, |
286 | insn_move = 6, |
287 | insn_string = 7, |
288 | insn_bit_manip = 8, |
289 | insn_flag_manip = 9, |
290 | insn_fpu = 10, |
291 | insn_interrupt = 13, |
292 | insn_system = 14, |
293 | insn_other = 15 |
294 | }; |
295 | |
296 | enum x86_insn_type { |
297 | insn_invalid = 0, /* invalid instruction */ |
298 | /* insn_controlflow */ |
299 | insn_jmp = 0x1001, |
300 | insn_jcc = 0x1002, |
301 | insn_call = 0x1003, |
302 | insn_callcc = 0x1004, |
303 | insn_return = 0x1005, |
304 | /* insn_arithmetic */ |
305 | insn_add = 0x2001, |
306 | insn_sub = 0x2002, |
307 | insn_mul = 0x2003, |
308 | insn_div = 0x2004, |
309 | insn_inc = 0x2005, |
310 | insn_dec = 0x2006, |
311 | insn_shl = 0x2007, |
312 | insn_shr = 0x2008, |
313 | insn_rol = 0x2009, |
314 | insn_ror = 0x200A, |
315 | /* insn_logic */ |
316 | insn_and = 0x3001, |
317 | insn_or = 0x3002, |
318 | insn_xor = 0x3003, |
319 | insn_not = 0x3004, |
320 | insn_neg = 0x3005, |
321 | /* insn_stack */ |
322 | insn_push = 0x4001, |
323 | insn_pop = 0x4002, |
324 | insn_pushregs = 0x4003, |
325 | insn_popregs = 0x4004, |
326 | insn_pushflags = 0x4005, |
327 | insn_popflags = 0x4006, |
328 | insn_enter = 0x4007, |
329 | insn_leave = 0x4008, |
330 | /* insn_comparison */ |
331 | insn_test = 0x5001, |
332 | insn_cmp = 0x5002, |
333 | /* insn_move */ |
334 | insn_mov = 0x6001, /* move */ |
335 | insn_movcc = 0x6002, /* conditional move */ |
336 | insn_xchg = 0x6003, /* exchange */ |
337 | insn_xchgcc = 0x6004, /* conditional exchange */ |
338 | /* insn_string */ |
339 | insn_strcmp = 0x7001, |
340 | insn_strload = 0x7002, |
341 | insn_strmov = 0x7003, |
342 | insn_strstore = 0x7004, |
343 | insn_translate = 0x7005, /* xlat */ |
344 | /* insn_bit_manip */ |
345 | insn_bittest = 0x8001, |
346 | insn_bitset = 0x8002, |
347 | insn_bitclear = 0x8003, |
348 | /* insn_flag_manip */ |
349 | insn_clear_carry = 0x9001, |
350 | insn_clear_zero = 0x9002, |
351 | insn_clear_oflow = 0x9003, |
352 | insn_clear_dir = 0x9004, |
353 | insn_clear_sign = 0x9005, |
354 | insn_clear_parity = 0x9006, |
355 | insn_set_carry = 0x9007, |
356 | insn_set_zero = 0x9008, |
357 | insn_set_oflow = 0x9009, |
358 | insn_set_dir = 0x900A, |
359 | insn_set_sign = 0x900B, |
360 | insn_set_parity = 0x900C, |
361 | insn_tog_carry = 0x9010, |
362 | insn_tog_zero = 0x9020, |
363 | insn_tog_oflow = 0x9030, |
364 | insn_tog_dir = 0x9040, |
365 | insn_tog_sign = 0x9050, |
366 | insn_tog_parity = 0x9060, |
367 | /* insn_fpu */ |
368 | insn_fmov = 0xA001, |
369 | insn_fmovcc = 0xA002, |
370 | insn_fneg = 0xA003, |
371 | insn_fabs = 0xA004, |
372 | insn_fadd = 0xA005, |
373 | insn_fsub = 0xA006, |
374 | insn_fmul = 0xA007, |
375 | insn_fdiv = 0xA008, |
376 | insn_fsqrt = 0xA009, |
377 | insn_fcmp = 0xA00A, |
378 | insn_fcos = 0xA00C, |
379 | insn_fldpi = 0xA00D, |
380 | insn_fldz = 0xA00E, |
381 | insn_ftan = 0xA00F, |
382 | insn_fsine = 0xA010, |
383 | insn_fsys = 0xA020, |
384 | /* insn_interrupt */ |
385 | insn_int = 0xD001, |
386 | insn_intcc = 0xD002, /* not present in x86 ISA */ |
387 | insn_iret = 0xD003, |
388 | insn_bound = 0xD004, |
389 | insn_debug = 0xD005, |
390 | insn_trace = 0xD006, |
391 | insn_invalid_op = 0xD007, |
392 | insn_oflow = 0xD008, |
393 | /* insn_system */ |
394 | insn_halt = 0xE001, |
395 | insn_in = 0xE002, /* input from port/bus */ |
396 | insn_out = 0xE003, /* output to port/bus */ |
397 | insn_cpuid = 0xE004, |
398 | /* insn_other */ |
399 | insn_nop = 0xF001, |
400 | insn_bcdconv = 0xF002, /* convert to or from BCD */ |
401 | insn_szconv = 0xF003 /* change size of operand */ |
402 | }; |
403 | |
404 | /* These flags specify special characteristics of the instruction, such as |
405 | * whether the instruction is privileged or whether it serializes the |
406 | * pipeline. |
407 | * NOTE : These may not be accurate for all instructions; updates to the |
408 | * opcode tables have not been completed. */ |
/* x86_insn_note : bit flags describing special characteristics of an
 * instruction; the values are distinct powers of two.
 * No trailing comma after the last enumerator, for consistency with the
 * other enums in this header and so that pedantic C89 / pre-C++11 C++
 * consumers can include it. */
enum x86_insn_note {
    insn_note_ring0    = 1,     /* Only available in ring 0 */
    insn_note_smm      = 2,     /* "" in System Management Mode */
    insn_note_serial   = 4,     /* Serializing instruction */
    insn_note_nonswap  = 8,     /* Does not swap arguments in att-style formatting */
    insn_note_nosuffix = 16     /* Does not have size suffix in att-style formatting */
};
416 | |
417 | /* This specifies what effects the instruction has on the %eflags register */ |
418 | enum x86_flag_status { |
419 | insn_carry_set = 0x1, /* CF */ |
420 | insn_zero_set = 0x2, /* ZF */ |
421 | insn_oflow_set = 0x4, /* OF */ |
422 | insn_dir_set = 0x8, /* DF */ |
423 | insn_sign_set = 0x10, /* SF */ |
424 | insn_parity_set = 0x20, /* PF */ |
425 | insn_carry_or_zero_set = 0x40, |
426 | insn_zero_set_or_sign_ne_oflow = 0x80, |
427 | insn_carry_clear = 0x100, |
428 | insn_zero_clear = 0x200, |
429 | insn_oflow_clear = 0x400, |
430 | insn_dir_clear = 0x800, |
431 | insn_sign_clear = 0x1000, |
432 | insn_parity_clear = 0x2000, |
433 | insn_sign_eq_oflow = 0x4000, |
434 | insn_sign_ne_oflow = 0x8000 |
435 | }; |
436 | |
437 | /* The CPU model in which the instruction first appeared; this can be used |
438 | * to mask out instructions appearing in earlier or later models or to |
439 | * check the portability of a binary. |
440 | * NOTE : These may not be accurate for all instructions; updates to the |
441 | * opcode tables have not been completed. */ |
442 | enum x86_insn_cpu { |
443 | cpu_8086 = 1, /* Intel */ |
444 | cpu_80286 = 2, |
445 | cpu_80386 = 3, |
446 | cpu_80387 = 4, |
447 | cpu_80486 = 5, |
448 | cpu_pentium = 6, |
449 | cpu_pentiumpro = 7, |
450 | cpu_pentium2 = 8, |
451 | cpu_pentium3 = 9, |
452 | cpu_pentium4 = 10, |
453 | cpu_k6 = 16, /* AMD */ |
454 | cpu_k7 = 32, |
455 | cpu_athlon = 48 |
456 | }; |
457 | |
458 | /* CPU ISA subsets: These are derived from the Instruction Groups in |
459 | * Intel Vol 1 Chapter 5; they represent subsets of the IA32 ISA but |
460 | * do not reflect the 'type' of the instruction in the same way that |
461 | * x86_insn_group does. In short, these are AMD/Intel's somewhat useless |
462 | * designations. |
463 | * NOTE : These may not be accurate for all instructions; updates to the |
464 | * opcode tables have not been completed. */ |
465 | enum x86_insn_isa { |
466 | isa_gp = 1, /* general purpose */ |
467 | isa_fp = 2, /* floating point */ |
468 | isa_fpumgt = 3, /* FPU/SIMD management */ |
469 | isa_mmx = 4, /* Intel MMX */ |
470 | isa_sse1 = 5, /* Intel SSE SIMD */ |
471 | isa_sse2 = 6, /* Intel SSE2 SIMD */ |
472 | isa_sse3 = 7, /* Intel SSE3 SIMD */ |
473 | isa_3dnow = 8, /* AMD 3DNow! SIMD */ |
474 | isa_sys = 9 /* system instructions */ |
475 | }; |
476 | |
477 | enum x86_insn_prefix { |
478 | insn_no_prefix = 0, |
479 | insn_rep_zero = 1, /* REPZ and REPE */ |
480 | insn_rep_notzero = 2, /* REPNZ and REPNE */ |
481 | insn_lock = 4 /* LOCK: */ |
482 | }; |
483 | |
484 | /* TODO: maybe provide insn_new/free(), and have disasm return new insn_t */ |
485 | /* x86_insn_t : an X86 instruction */ |
486 | typedef struct { |
487 | /* information about the instruction */ |
488 | uint32_t addr; /* load address */ |
489 | uint32_t offset; /* offset into file/buffer */ |
490 | enum x86_insn_group group; /* meta-type, e.g. insn_controlflow */ |
491 | enum x86_insn_type type; /* type, e.g. insn_jmp */ |
492 | enum x86_insn_note note; /* note, e.g. insn_note_ring0 */ |
493 | unsigned char bytes[MAX_INSN_SIZE]; |
494 | unsigned char size; /* size of insn in bytes */ |
495 | /* 16/32-bit mode settings */ |
496 | unsigned char addr_size; /* default address size : 2 or 4 */ |
497 | unsigned char op_size; /* default operand size : 2 or 4 */ |
498 | /* CPU/instruction set */ |
499 | enum x86_insn_cpu cpu; |
500 | enum x86_insn_isa isa; |
501 | /* flags */ |
502 | enum x86_flag_status flags_set; /* flags set or tested by insn */ |
503 | enum x86_flag_status flags_tested; |
504 | /* stack */ |
505 | unsigned char stack_mod; /* 0 or 1 : is the stack modified? */ |
506 | int32_t stack_mod_val; /* val stack is modified by if known */ |
507 | |
508 | /* the instruction proper */ |
509 | enum x86_insn_prefix prefix; /* prefixes ORed together */ |
510 | char prefix_string[MAX_PREFIX_STR]; /* prefixes [might be truncated] */ |
511 | char mnemonic[MAX_MNEM_STR]; |
512 | x86_oplist_t *operands; /* list of explicit/implicit operands */ |
513 | size_t operand_count; /* total number of operands */ |
514 | size_t explicit_count; /* number of explicit operands */ |
515 | /* convenience fields for user */ |
516 | void *block; /* code block containing this insn */ |
517 | void *function; /* function containing this insn */ |
518 | int tag; /* tag the insn as seen/processed */ |
519 | } x86_insn_t; |
520 | |
521 | |
522 | /* returns 0 if an instruction is invalid, 1 if valid */ |
523 | int x86_insn_is_valid( x86_insn_t *insn ); |
524 | |
525 | /* DISASSEMBLY ROUTINES |
526 | * Canonical order of arguments is |
527 | * (buf, buf_len, buf_rva, offset, len, insn, func, arg, resolve_func) |
528 | * ...but of course all of these are not used at the same time. |
529 | */ |
530 | |
531 | |
532 | /* Function prototype for caller-supplied callback routine |
533 | * These callbacks are intended to process 'insn' further, e.g. by |
534 | * adding it to a linked list, database, etc */ |
535 | typedef void (*DISASM_CALLBACK)( x86_insn_t *insn, void * arg ); |
536 | |
537 | /* Function prototype for caller-supplied address resolver. |
538 | * This routine is used to determine the rva to disassemble next, given |
539 | * the 'dest' operand of a jump/call. This allows the caller to resolve |
540 | * jump/call targets stored in a register or on the stack, and also allows |
541 | * the caller to prevent endless loops by checking if an address has |
542 | * already been disassembled. If an address cannot be resolved from the |
543 | * operand, or if the address has already been disassembled, this routine |
544 | * should return -1; in all other cases the RVA to be disassembled next |
545 | * should be returned. */ |
546 | typedef int32_t (*DISASM_RESOLVER)( x86_op_t *op, x86_insn_t * current_insn, |
547 | void *arg ); |
548 | |
549 | |
550 | /* x86_disasm: Disassemble a single instruction from a buffer of bytes. |
551 | * Returns size of instruction in bytes. |
552 | * Caller is responsible for calling x86_oplist_free() on |
553 | * a reused "insn" to avoid leaking memory when calling this |
554 | * function repeatedly. |
555 | * buf : Buffer of bytes to disassemble |
556 | * buf_len : Length of the buffer |
557 | * buf_rva : Load address of the start of the buffer |
558 | * offset : Offset in buffer to disassemble |
559 | * insn : Structure to fill with disassembled instruction |
560 | */ |
561 | unsigned int x86_disasm( unsigned char *buf, unsigned int buf_len, |
562 | uint32_t buf_rva, unsigned int offset, |
563 | x86_insn_t * insn ); |
564 | |
565 | /* x86_disasm_range: Sequential disassembly of a range of bytes in a buffer, |
566 | * invoking a callback function each time an instruction |
567 | * is successfully disassembled. The 'range' refers to the |
568 | * bytes between 'offset' and 'offset + len' in the buffer; |
569 | * 'len' is assumed to be less than the length of the buffer. |
570 | * Returns number of instructions processed. |
571 | * buf : Buffer of bytes to disassemble (e.g. .text section) |
572 | * buf_rva : Load address of buffer (e.g. ELF Virtual Address) |
573 | * offset : Offset in buffer to start disassembly at |
574 | * len : Number of bytes to disassemble |
575 | * func : Callback function to invoke (may be NULL) |
576 | * arg : Arbitrary data to pass to callback (may be NULL) |
577 | */ |
578 | unsigned int x86_disasm_range( unsigned char *buf, uint32_t buf_rva, |
579 | unsigned int offset, unsigned int len, |
580 | DISASM_CALLBACK func, void *arg ); |
581 | |
582 | /* x86_disasm_forward: Flow-of-execution disassembly of the bytes in a buffer, |
583 | * invoking a callback function each time an instruction |
584 | * is successfully disassembled. |
585 | * buf : Buffer to disassemble (e.g. .text section) |
586 | * buf_len : Number of bytes in buffer |
587 | * buf_rva : Load address of buffer (e.g. ELF Virtual Address) |
588 | * offset : Offset in buffer to start disassembly at (e.g. entry point) |
589 | * func : Callback function to invoke (may be NULL) |
590 | * arg : Arbitrary data to pass to callback (may be NULL) |
591 | * resolver: Caller-supplied address resolver. If no resolver is |
592 | * supplied, a default internal one is used -- however the |
593 | * internal resolver does NOT catch loops and could end up |
594 | * disassembling forever.. |
595 | * r_arg : Arbitrary data to pass to resolver (may be NULL) |
596 | */ |
597 | unsigned int x86_disasm_forward( unsigned char *buf, unsigned int buf_len, |
598 | uint32_t buf_rva, unsigned int offset, |
599 | DISASM_CALLBACK func, void *arg, |
600 | DISASM_RESOLVER resolver, void *r_arg ); |
601 | |
602 | /* Instruction operands: these are stored as a list of explicit and |
603 | * implicit operands. It is recommended that the 'foreach' routines |
604 | * be used when examining operands for purposes of data flow analysis */ |
605 | |
606 | /* Operand FOREACH callback: 'arg' is an arbitrary parameter passed to the |
607 | * foreach routine, 'insn' is the x86_insn_t whose operands are being |
608 | * iterated over, and 'op' is the current x86_op_t */ |
609 | typedef void (*x86_operand_fn)(x86_op_t *op, x86_insn_t *insn, void *arg); |
610 | |
611 | /* FOREACH types: these are used to limit the foreach results to |
612 | * operands which match a certain "type" (implicit or explicit) |
613 | * or which are accessed in certain ways (e.g. read or write). Note |
614 | * that this operates on the operand list of a single instruction, so |
615 | * specifying the 'real' operand type (register, memory, etc) is not |
616 | * useful. Note also that by definition Execute Access implies Read |
617 | * Access and implies Not Write Access. |
618 | * The "type" (implicit or explicit) and the access method can |
619 | * be ORed together, e.g. op_wo | op_explicit */ |
620 | enum x86_op_foreach_type { |
621 | op_any = 0, /* ALL operands (explicit, implicit, rwx) */ |
622 | op_dest = 1, /* operands with Write access */ |
623 | op_src = 2, /* operands with Read access */ |
624 | op_ro = 3, /* operands with Read but not Write access */ |
625 | op_wo = 4, /* operands with Write but not Read access */ |
626 | op_xo = 5, /* operands with Execute access */ |
627 | op_rw = 6, /* operands with Read AND Write access */ |
628 | op_implicit = 0x10, /* operands that are implied by the opcode */ |
629 | op_explicit = 0x20 /* operands that are not side-effects */ |
630 | }; |
631 | |
632 | |
633 | /* free the operand list associated with an instruction -- useful for |
634 | * preventing memory leaks when free()ing an x86_insn_t */ |
635 | void x86_oplist_free( x86_insn_t *insn ); |
636 | |
637 | /* Operand foreach: invokes 'func' with 'insn' and 'arg' as arguments. The |
638 | * 'type' parameter is used to select only operands matching specific |
639 | * criteria. */ |
640 | int x86_operand_foreach( x86_insn_t *insn, x86_operand_fn func, void *arg, |
641 | enum x86_op_foreach_type type); |
642 | |
643 | /* convenience routine: returns count of operands matching 'type' */ |
644 | size_t x86_operand_count( x86_insn_t *insn, enum x86_op_foreach_type type ); |
645 | |
646 | /* accessor functions for the operands */ |
647 | x86_op_t * x86_operand_1st( x86_insn_t *insn ); |
648 | x86_op_t * x86_operand_2nd( x86_insn_t *insn ); |
649 | x86_op_t * x86_operand_3rd( x86_insn_t *insn ); |
650 | |
651 | /* these allow libdisasm 2.0 accessor functions to still be used */ |
652 | #define x86_get_dest_operand( insn ) x86_operand_1st( insn ) |
653 | #define x86_get_src_operand( insn ) x86_operand_2nd( insn ) |
654 | #define x86_get_imm_operand( insn ) x86_operand_3rd( insn ) |
655 | |
656 | /* get size of operand data in bytes */ |
657 | unsigned int x86_operand_size( x86_op_t *op ); |
658 | |
659 | /* Operand Convenience Routines: the following three routines are common |
660 | * operations on operands, intended to ease the burden of the programmer. */ |
661 | |
662 | /* Get Address: return the value of an offset operand, or the offset of |
663 | * a segment:offset absolute address */ |
664 | uint32_t x86_get_address( x86_insn_t *insn ); |
665 | |
666 | /* Get Relative Offset: return as a sign-extended int32_t the near or far |
667 | * relative offset operand, or 0 if there is none. There can be only one |
668 | * relative offset operand in an instruction. */ |
669 | int32_t x86_get_rel_offset( x86_insn_t *insn ); |
670 | |
671 | /* Get Branch Target: return the x86_op_t containing the target of |
672 | * a jump or call operand, or NULL if there is no branch target. |
673 | * Internally, a 'branch target' is defined as any operand with |
674 | * Execute Access set. There can be only one branch target per instruction. */ |
675 | x86_op_t * x86_get_branch_target( x86_insn_t *insn ); |
676 | |
677 | /* Get Immediate: return the x86_op_t containing the immediate operand |
678 | * for this instruction, or NULL if there is no immediate operand. There |
679 | * can be only one immediate operand per instruction */ |
680 | x86_op_t * x86_get_imm( x86_insn_t *insn ); |
681 | |
682 | /* Get Raw Immediate Data: returns a pointer to the immediate data encoded |
683 | * in the instruction. This is useful for large data types [>32 bits] currently |
684 | * not supported by libdisasm, or for determining if the disassembler |
685 | * screwed up the conversion of the immediate data. Note that 'imm' in this |
686 | * context refers to immediate data encoded at the end of an instruction as |
687 | * detailed in the Intel Manual Vol II Chapter 2; it does not refer to the |
688 | * 'op_immediate' operand (the third operand in instructions like 'mul') */ |
689 | unsigned char * x86_get_raw_imm( x86_insn_t *insn ); |
690 | |
691 | |
692 | /* More accessor functions, this time for user-defined info... */ |
693 | /* set the address (usually RVA) of the insn */ |
694 | void x86_set_insn_addr( x86_insn_t *insn, uint32_t addr ); |
695 | |
696 | /* set the offset (usually offset into file) of the insn */ |
697 | void x86_set_insn_offset( x86_insn_t *insn, unsigned int offset ); |
698 | |
699 | /* set a pointer to the function owning the instruction. The |
700 | * type of 'func' is user-defined; libdisasm does not use the func field. */ |
701 | void x86_set_insn_function( x86_insn_t *insn, void * func ); |
702 | |
703 | /* set a pointer to the block of code owning the instruction. The |
704 | * type of 'block' is user-defined; libdisasm does not use the block field. */ |
705 | void x86_set_insn_block( x86_insn_t *insn, void * block ); |
706 | |
707 | /* instruction tagging: these routines allow the programmer to mark |
708 | * instructions as "seen" in a DFS, for example. libdisasm does not use |
709 | * the tag field.*/ |
710 | /* set insn->tag to 1 */ |
711 | void x86_tag_insn( x86_insn_t *insn ); |
712 | /* set insn->tag to 0 */ |
713 | void x86_untag_insn( x86_insn_t *insn ); |
714 | /* return insn->tag */ |
715 | int x86_insn_is_tagged( x86_insn_t *insn ); |
716 | |
717 | |
718 | /* Disassembly output formats: |
719 | * AT&T is standard AS/GAS-style: "mnemonic\tsrc, dest, imm" |
720 | * Intel is standard MASM/NASM/TASM-style: "mnemonic\tdest,src, imm" |
721 | * Native is tab-delimited: "RVA\tbytes\tmnemonic\tdest\tsrc\timm" |
722 | * XML is the usual <insn> ... </insn> markup |
723 | * Raw is addr|offset|size|bytes|prefix... see libdisasm_formats.7 |
724 | * 'header: N bytes' presumably is the size of that format's header string. */ |
725 | enum x86_asm_format { |
726 | unknown_syntax = 0, /* placeholder value: never use! */ |
727 | native_syntax, /* header: 35 bytes */ |
728 | intel_syntax, /* header: 23 bytes */ |
729 | att_syntax, /* header: 23 bytes */ |
730 | xml_syntax, /* header: 679 bytes */ |
731 | raw_syntax /* header: 172 bytes */ |
732 | }; |
733 | |
734 | /* x86_format_operand: format (sprintf) operand 'op' into 'buf' using syntax 'format' */ |
735 | int x86_format_operand(x86_op_t *op, char *buf, int len, |
736 | enum x86_asm_format format); |
737 | |
738 | /* x86_format_mnemonic: format (sprintf) the mnemonic of 'insn' into 'buf' using syntax 'format' */ |
739 | int x86_format_mnemonic(x86_insn_t *insn, char *buf, int len, |
740 | enum x86_asm_format format); |
741 | |
742 | /* x86_format_insn: format (sprintf) 'insn', all operands included, into 'buf' using syntax 'format' */ |
743 | int x86_format_insn(x86_insn_t *insn, char *buf, int len, |
744 | enum x86_asm_format format); |
745 | |
746 | /* x86_format_header: fill 'buf' with a description of the syntax of 'format' */ |
747 | int x86_format_header(char *buf, int len, enum x86_asm_format format); |
748 | |
749 | /* x86_endian: endianness of an x86 CPU, 0 = big, 1 = little; always returns 1 */ |
750 | unsigned int x86_endian(void); |
751 | |
752 | /* Default address size (x86_addr_size) and operand size (x86_op_size), in bytes */ |
753 | unsigned int x86_addr_size(void); |
754 | unsigned int x86_op_size(void); |
755 | |
756 | /* x86_word_size: size of a machine word, in bytes */ |
757 | unsigned int x86_word_size(void); |
758 | |
759 | /* maximum size of a code instruction; x86_max_inst_size is an alias macro */ |
760 | #define x86_max_inst_size(x) x86_max_insn_size(x) /* 'x' is forwarded to a (void) function: pass nothing */ |
761 | unsigned int x86_max_insn_size(void); |
762 | |
763 | /* register IDs of the Stack pointer, Frame pointer, Instruction pointer and Flags registers */ |
764 | unsigned int x86_sp_reg(void); |
765 | unsigned int x86_fp_reg(void); |
766 | unsigned int x86_ip_reg(void); |
767 | unsigned int x86_flag_reg(void); |
768 | |
769 | /* x86_reg_from_id: fill the 'reg' struct with the details of register 'id' */ |
770 | void x86_reg_from_id(unsigned int id, x86_reg_t *reg); |
771 | |
772 | /* convenience macro showing how to resolve an aliased register; proto is |
773 | * void x86_get_aliased_reg( x86_reg_t *alias_reg, x86_reg_t *output_reg ) |
774 | * 'output_reg' is filled with the register that reg operand 'alias_reg' is an |
775 | * alias for. Arguments are parenthesized so any pointer expression works. */ |
776 | #define x86_get_aliased_reg( alias_reg, output_reg ) \ |
777 | x86_reg_from_id( (alias_reg)->alias, (output_reg) ) |
778 | |
779 | |
780 | /* ================================== Invariant Instruction Representation */ |
781 | /* Invariant instructions are used for generating binary signatures; |
782 | * the instruction is modified so that all variant bytes in an instruction |
783 | * are replaced with a wildcard byte. |
784 | * |
785 | * A 'variant byte' is one that is expected to be modified by either the |
786 | * static or the dynamic linker: for example, an address encoded in an |
787 | * instruction. |
788 | * |
789 | * By comparing the invariant representation of one instruction [or of a |
790 | * sequence of instructions] with the invariant representation of another, |
791 | * one can determine whether the two invariant representations are from the same |
792 | * relocatable object [.o] file. Thus one can use binary signatures [which |
793 | * are just sequences of invariant instruction representations] to look for |
794 | * library routines which have been statically-linked into a binary. |
795 | * |
796 | * The invariant routines are faster and smaller than the disassembly |
797 | * routines; they can be used to determine the size of an instruction |
798 | * without all of the overhead of a full instruction disassembly. |
799 | */ |
800 | |
801 | /* Wildcard value substituted for each variant byte of an invariant instruction */ |
802 | #define X86_WILDCARD_BYTE 0xF4 |
803 | |
804 | typedef struct { /* one operand of an invariant instruction (see x86_invariant_t) */ |
805 | enum x86_op_type type; /* operand type */ |
806 | enum x86_op_datatype datatype; /* operand data type, i.e. its size */ |
807 | enum x86_op_access access; /* operand access [RWX] */ |
808 | enum x86_op_flags flags; /* miscellaneous operand flags */ |
809 | } x86_invariant_op_t; |
810 | |
811 | typedef struct { /* invariant form of a single instruction */ |
812 | unsigned char bytes[64]; /* invariant representation of the insn bytes */ |
813 | unsigned int size; /* number of bytes in insn */ |
814 | enum x86_insn_group group; /* meta-type, e.g. INS_EXEC */ |
815 | enum x86_insn_type type; /* type, e.g. INS_BRANCH */ |
816 | x86_invariant_op_t operands[3]; /* operands, in order: dest, src, imm */ |
817 | } x86_invariant_t; |
818 | |
819 | |
820 | /* build a version of the instruction at 'buf' with its variant bytes masked out; |
821 | * presumably it is stored in 'inv' and the insn size returned (TODO confirm) */ |
822 | size_t x86_invariant_disasm(unsigned char *buf, int buf_len, x86_invariant_t *inv); |
823 | /* x86_size_disasm: return the size in bytes of the instruction pointed to by |
824 | * 'buf'; this uses x86_invariant_disasm since it is faster than x86_disasm */ |
825 | size_t x86_size_disasm(unsigned char *buf, unsigned int buf_len); |
826 | |
827 | #ifdef __cplusplus |
828 | } |
829 | #endif |
830 | |
831 | |
832 | #endif |
833 | |