bytereader.h source code [breakpad/common/dwarf/bytereader.h]

1	// -*- mode: C++ -*-
2
3	// Copyright (c) 2010 Google Inc. All Rights Reserved.
4	//
5	// Redistribution and use in source and binary forms, with or without
6	// modification, are permitted provided that the following conditions are
7	// met:
8	//
9	//     * Redistributions of source code must retain the above copyright
10	// notice, this list of conditions and the following disclaimer.
11	//     * Redistributions in binary form must reproduce the above
12	// copyright notice, this list of conditions and the following disclaimer
13	// in the documentation and/or other materials provided with the
14	// distribution.
15	//     * Neither the name of Google Inc. nor the names of its
16	// contributors may be used to endorse or promote products derived from
17	// this software without specific prior written permission.
18	//
19	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31	#ifndef COMMON_DWARF_BYTEREADER_H__
32	#define COMMON_DWARF_BYTEREADER_H__
33
34	#include <stdint.h>
35
36	#include <string>
37
38	#include "common/dwarf/types.h"
39	#include "common/dwarf/dwarf2enums.h"
40
41	namespace google_breakpad {
42
// Byte order used when decoding multi-byte values.
//
// We can't use the obvious names LITTLE_ENDIAN and BIG_ENDIAN because
// they conflict with macros of the same name.
enum Endianness {
  ENDIANNESS_BIG,     // Most significant byte first.
  ENDIANNESS_LITTLE   // Least significant byte first.
};
49
50	// A ByteReader knows how to read single- and multi-byte values of
51	// various endiannesses, sizes, and encodings, as used in DWARF
52	// debugging information and Linux C++ exception handling data.
53	class ByteReader {
54	public:
55	// Construct a ByteReader capable of reading one-, two-, four-, and
56	// eight-byte values according to ENDIANNESS, absolute machine-sized
57	// addresses, DWARF-style "initial length" values, signed and
58	// unsigned LEB128 numbers, and Linux C++ exception handling data's
59	// encoded pointers.
60	explicit ByteReader(enum Endianness endianness);
61	virtual ~ByteReader();
62
63	// Read a single byte from BUFFER and return it as an unsigned 8 bit
64	// number.
65	uint8_t ReadOneByte(const uint8_t* buffer) const;
66
67	// Read two bytes from BUFFER and return them as an unsigned 16 bit
68	// number, using this ByteReader's endianness.
69	uint16_t ReadTwoBytes(const uint8_t* buffer) const;
70
71	// Read three bytes from BUFFER and return them as an unsigned 64 bit
72	// number, using this ByteReader's endianness. DWARF 5 uses this encoding
73	// for various index-related DW_FORMs.
74	uint64_t ReadThreeBytes(const uint8_t* buffer) const;
75
76	// Read four bytes from BUFFER and return them as an unsigned 32 bit
77	// number, using this ByteReader's endianness. This function returns
78	// a uint64_t so that it is compatible with ReadAddress and
79	// ReadOffset. The number it returns will never be outside the range
80	// of an unsigned 32 bit integer.
81	uint64_t ReadFourBytes(const uint8_t* buffer) const;
82
83	// Read eight bytes from BUFFER and return them as an unsigned 64
84	// bit number, using this ByteReader's endianness.
85	uint64_t ReadEightBytes(const uint8_t* buffer) const;
86
87	// Read an unsigned LEB128 (Little Endian Base 128) number from
88	// BUFFER and return it as an unsigned 64 bit integer. Set LEN to
89	// the number of bytes read.
90	//
91	// The unsigned LEB128 representation of an integer N is a variable
92	// number of bytes:
93	//
94	// - If N is between 0 and 0x7f, then its unsigned LEB128
95	// representation is a single byte whose value is N.
96	//
97	// - Otherwise, its unsigned LEB128 representation is (N & 0x7f) \|
98	// 0x80, followed by the unsigned LEB128 representation of N /
99	// 128, rounded towards negative infinity.
100	//
101	// In other words, we break VALUE into groups of seven bits, put
102	// them in little-endian order, and then write them as eight-bit
103	// bytes with the high bit on all but the last.
104	uint64_t ReadUnsignedLEB128(const uint8_t* buffer, size_t* len) const;
105
106	// Read a signed LEB128 number from BUFFER and return it as an
107	// signed 64 bit integer. Set LEN to the number of bytes read.
108	//
109	// The signed LEB128 representation of an integer N is a variable
110	// number of bytes:
111	//
112	// - If N is between -0x40 and 0x3f, then its signed LEB128
113	// representation is a single byte whose value is N in two's
114	// complement.
115	//
116	// - Otherwise, its signed LEB128 representation is (N & 0x7f) \|
117	// 0x80, followed by the signed LEB128 representation of N / 128,
118	// rounded towards negative infinity.
119	//
120	// In other words, we break VALUE into groups of seven bits, put
121	// them in little-endian order, and then write them as eight-bit
122	// bytes with the high bit on all but the last.
123	int64_t ReadSignedLEB128(const uint8_t* buffer, size_t* len) const;
124
125	// Indicate that addresses on this architecture are SIZE bytes long. SIZE
126	// must be either 4 or 8. (DWARF allows addresses to be any number of
127	// bytes in length from 1 to 255, but we only support 32- and 64-bit
128	// addresses at the moment.) You must call this before using the
129	// ReadAddress member function.
130	//
131	// For data in a .debug_info section, or something that .debug_info
132	// refers to like line number or macro data, the compilation unit
133	// header's address_size field indicates the address size to use. Call
134	// frame information doesn't indicate its address size (a shortcoming of
135	// the spec); you must supply the appropriate size based on the
136	// architecture of the target machine.
137	void SetAddressSize(uint8_t size);
138
139	// Return the current address size, in bytes. This is either 4,
140	// indicating 32-bit addresses, or 8, indicating 64-bit addresses.
141	uint8_t AddressSize() const { return address_size_; }
142
143	// Read an address from BUFFER and return it as an unsigned 64 bit
144	// integer, respecting this ByteReader's endianness and address size. You
145	// must call SetAddressSize before calling this function.
146	uint64_t ReadAddress(const uint8_t* buffer) const;
147
148	// DWARF actually defines two slightly different formats: 32-bit DWARF
149	// and 64-bit DWARF. This is not* related to the size of registers or*
150	// addresses on the target machine; it refers only to the size of section
151	// offsets and data lengths appearing in the DWARF data. One only needs
152	// 64-bit DWARF when the debugging data itself is larger than 4GiB.
153	// 32-bit DWARF can handle x86_64 or PPC64 code just fine, unless the
154	// debugging data itself is very large.
155	//
156	// DWARF information identifies itself as 32-bit or 64-bit DWARF: each
157	// compilation unit and call frame information entry begins with an
158	// "initial length" field, which, in addition to giving the length of the
159	// data, also indicates the size of section offsets and lengths appearing
160	// in that data. The ReadInitialLength member function, below, reads an
161	// initial length and sets the ByteReader's offset size as a side effect.
162	// Thus, in the normal process of reading DWARF data, the appropriate
163	// offset size is set automatically. So, you should only need to call
164	// SetOffsetSize if you are using the same ByteReader to jump from the
165	// midst of one block of DWARF data into another.
166
167	// Read a DWARF "initial length" field from START, and return it as
168	// an unsigned 64 bit integer, respecting this ByteReader's
169	// endianness. Set LEN to the length of the initial length in*
170	// bytes, either four or twelve. As a side effect, set this
171	// ByteReader's offset size to either 4 (if we see a 32-bit DWARF
172	// initial length) or 8 (if we see a 64-bit DWARF initial length).
173	//
174	// A DWARF initial length is either:
175	//
176	// - a byte count stored as an unsigned 32-bit value less than
177	// 0xffffff00, indicating that the data whose length is being
178	// measured uses the 32-bit DWARF format, or
179	//
180	// - The 32-bit value 0xffffffff, followed by a 64-bit byte count,
181	// indicating that the data whose length is being measured uses
182	// the 64-bit DWARF format.
183	uint64_t ReadInitialLength(const uint8_t* start, size_t* len);
184
185	// Read an offset from BUFFER and return it as an unsigned 64 bit
186	// integer, respecting the ByteReader's endianness. In 32-bit DWARF, the
187	// offset is 4 bytes long; in 64-bit DWARF, the offset is eight bytes
188	// long. You must call ReadInitialLength or SetOffsetSize before calling
189	// this function; see the comments above for details.
190	uint64_t ReadOffset(const uint8_t* buffer) const;
191
192	// Return the current offset size, in bytes.
193	// A return value of 4 indicates that we are reading 32-bit DWARF.
194	// A return value of 8 indicates that we are reading 64-bit DWARF.
195	uint8_t OffsetSize() const { return offset_size_; }
196
197	// Indicate that section offsets and lengths are SIZE bytes long. SIZE
198	// must be either 4 (meaning 32-bit DWARF) or 8 (meaning 64-bit DWARF).
199	// Usually, you should not call this function yourself; instead, let a
200	// call to ReadInitialLength establish the data's offset size
201	// automatically.
202	void SetOffsetSize(uint8_t size);
203
204	// The Linux C++ ABI uses a variant of DWARF call frame information
205	// for exception handling. This data is included in the program's
206	// address space as the ".eh_frame" section, and intepreted at
207	// runtime to walk the stack, find exception handlers, and run
208	// cleanup code. The format is mostly the same as DWARF CFI, with
209	// some adjustments made to provide the additional
210	// exception-handling data, and to make the data easier to work with
211	// in memory --- for example, to allow it to be placed in read-only
212	// memory even when describing position-independent code.
213	//
214	// In particular, exception handling data can select a number of
215	// different encodings for pointers that appear in the data, as
216	// described by the DwarfPointerEncoding enum. There are actually
217	// four axes(!) to the encoding:
218	//
219	// - The pointer size: pointers can be 2, 4, or 8 bytes long, or use
220	// the DWARF LEB128 encoding.
221	//
222	// - The pointer's signedness: pointers can be signed or unsigned.
223	//
224	// - The pointer's base address: the data stored in the exception
225	// handling data can be the actual address (that is, an absolute
226	// pointer), or relative to one of a number of different base
227	// addreses --- including that of the encoded pointer itself, for
228	// a form of "pc-relative" addressing.
229	//
230	// - The pointer may be indirect: it may be the address where the
231	// true pointer is stored. (This is used to refer to things via
232	// global offset table entries, program linkage table entries, or
233	// other tricks used in position-independent code.)
234	//
235	// There are also two options that fall outside that matrix
236	// altogether: the pointer may be omitted, or it may have padding to
237	// align it on an appropriate address boundary. (That last option
238	// may seem like it should be just another axis, but it is not.)
239
240	// Indicate that the exception handling data is loaded starting at
241	// SECTION_BASE, and that the start of its buffer in our own memory
242	// is BUFFER_BASE. This allows us to find the address that a given
243	// byte in our buffer would have when loaded into the program the
244	// data describes. We need this to resolve DW_EH_PE_pcrel pointers.
245	void SetCFIDataBase(uint64_t section_base, const uint8_t* buffer_base);
246
247	// Indicate that the base address of the program's ".text" section
248	// is TEXT_BASE. We need this to resolve DW_EH_PE_textrel pointers.
249	void SetTextBase(uint64_t text_base);
250
251	// Indicate that the base address for DW_EH_PE_datarel pointers is
252	// DATA_BASE. The proper value depends on the ABI; it is usually the
253	// address of the global offset table, held in a designated register in
254	// position-independent code. You will need to look at the startup code
255	// for the target system to be sure. I tried; my eyes bled.
256	void SetDataBase(uint64_t data_base);
257
258	// Indicate that the base address for the FDE we are processing is
259	// FUNCTION_BASE. This is the start address of DW_EH_PE_funcrel
260	// pointers. (This encoding does not seem to be used by the GNU
261	// toolchain.)
262	void SetFunctionBase(uint64_t function_base);
263
264	// Indicate that we are no longer processing any FDE, so any use of
265	// a DW_EH_PE_funcrel encoding is an error.
266	void ClearFunctionBase();
267
268	// Return true if ENCODING is a valid pointer encoding.
269	bool ValidEncoding(DwarfPointerEncoding encoding) const;
270
271	// Return true if we have all the information we need to read a
272	// pointer that uses ENCODING. This checks that the appropriate
273	// SetFooBase function for ENCODING has been called.
274	bool UsableEncoding(DwarfPointerEncoding encoding) const;
275
276	// Read an encoded pointer from BUFFER using ENCODING; return the
277	// absolute address it represents, and set LEN to the pointer's*
278	// length in bytes, including any padding for aligned pointers.
279	//
280	// This function calls 'abort' if ENCODING is invalid or refers to a
281	// base address this reader hasn't been given, so you should check
282	// with ValidEncoding and UsableEncoding first if you would rather
283	// die in a more helpful way.
284	uint64_t ReadEncodedPointer(const uint8_t* buffer,
285	DwarfPointerEncoding encoding,
286	size_t* len) const;
287
288	Endianness GetEndianness() const;
289	private:
290
291	// Function pointer type for our address and offset readers.
292	typedef uint64_t (ByteReader::AddressReader)(const* uint8_t) const*;
293
294	// Read an offset from BUFFER and return it as an unsigned 64 bit
295	// integer. DWARF2/3 define offsets as either 4 or 8 bytes,
296	// generally depending on the amount of DWARF2/3 info present.
297	// This function pointer gets set by SetOffsetSize.
298	AddressReader offset_reader_;
299
300	// Read an address from BUFFER and return it as an unsigned 64 bit
301	// integer. DWARF2/3 allow addresses to be any size from 0-255
302	// bytes currently. Internally we support 4 and 8 byte addresses,
303	// and will CHECK on anything else.
304	// This function pointer gets set by SetAddressSize.
305	AddressReader address_reader_;
306
307	Endianness endian_;
308	uint8_t address_size_;
309	uint8_t offset_size_;
310
311	// Base addresses for Linux C++ exception handling data's encoded pointers.
312	bool have_section_base_, have_text_base_, have_data_base_;
313	bool have_function_base_;
314	uint64_t section_base_, text_base_, data_base_, function_base_;
315	const uint8_t* buffer_base_;
316	};
317
318	} // namespace google_breakpad
319
320	#endif // COMMON_DWARF_BYTEREADER_H__
321

Browse the source code of breakpad/common/dwarf/bytereader.h