1// Copyright 2015 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15////////////////////////////////////////////////////////////////////////////////
16
17#ifndef PIEX_BINARY_PARSE_RANGE_CHECKED_BYTE_PTR_H_
18#define PIEX_BINARY_PARSE_RANGE_CHECKED_BYTE_PTR_H_
19
20#include <assert.h>
21
22#include <cstddef>
23#include <memory>
24#include <string>
25#include <vector>
26
27namespace piex {
28namespace binary_parse {
29
30// Since NaCl does not comply to C++11 we can not just use stdint.h.
31typedef unsigned short uint16; // NOLINT
32typedef short int16; // NOLINT
33typedef unsigned int uint32;
34typedef int int32;
35
36enum MemoryStatus {
37 RANGE_CHECKED_BYTE_SUCCESS = 0,
38 RANGE_CHECKED_BYTE_ERROR = 1,
39 RANGE_CHECKED_BYTE_ERROR_OVERFLOW = 2,
40 RANGE_CHECKED_BYTE_ERROR_UNDERFLOW = 3,
41};
42
43// Interface that RangeCheckedBytePtr uses to access the underlying array of
44// bytes. This allows RangeCheckedBytePtr to be used to access data as if it
45// were stored contiguously in memory, even if the data is in fact split up
46// into non-contiguous chunks and / or does not reside in memory.
47//
48// The only requirement is that the data can be read in pages of a fixed (but
49// configurable) size. Notionally, the byte array (which contains length()
50// bytes) is split up into non-overlapping pages of pageSize() bytes each.
51// (The last page may be shorter if length() is not a multiple of pageSize().)
52// There are therefore (length() - 1) / pageSize() + 1 such pages, with indexes
53// 0 through (length() - 1) / pageSize(). Page i contains the bytes from offset
54// i * pageSize() in the array up to and including the byte at offset
55// (i + 1) * pageSize() - 1 (or, in the case of the last page, length() - 1).
56//
57// In essence, RangeCheckedBytePtr and PagedByteArray together provide a poor
58// man's virtual-memory-and-memory-mapped-file work-alike in situations where
59// virtual memory cannot be used or would consume too much virtual address
60// space.
61//
62// Thread safety: In general, subclasses implementing this interface should
63// ensure that the member functions are thread-safe. It will then be safe to
64// access the same array from multiple threads. (Note that RangeCheckedBytePtr
65// itself is not thread-safe in the sense that a single instance of
66// RangeCheckedBytePtr cannot be used concurrently from multiple threads; it
67// is, however, safe to use different RangeCheckedBytePtr instances in
68// different threads to access the same PagedByteArray concurrently, assuming
69// that the PagedByteArray implementation is thread-safe.)
70class PagedByteArray {
71 public:
72 // Base class for pages in the byte array. Implementations of PagedByteArray
73 // can create a subclass of the Page class to manage the lifetime of buffers
74 // associated with a page returned by getPage(). For example, a
75 // PagedByteArray backed by a file might define a Page subclass like this:
76 //
77 // class FilePage : public Page {
78 // std::vector<unsigned char> bytes;
79 // };
80 //
81 // The corresponding getPage() implementation could then look like this:
82 //
83 // void getPage(size_t page_index, const unsigned char** begin,
84 // const unsigned char** end, std::shared_ptr<Page>* page)
85 // {
86 // // Create a new page.
87 // std::shared_ptr<FilePage> file_page(new FilePage());
88 //
89 // // Read contents of page from file into file_page->bytes.
90 // [...]
91 //
92 // // Set *begin and *end to point to beginning and end of
93 // // file_page->bytes vector.
94 // *begin = &file_page->bytes[0];
95 // *end = *begin + file_page->bytes.size();
96 //
97 // // Return page to caller
98 // *page = file_page;
99 // }
100 //
101 // In this way, the storage associated with the page (the FilePage::bytes
102 // vector) will be kept alive until the RangeCheckedBytePtr releases the
103 // shared pointer.
104 class Page {};
105
106 typedef std::shared_ptr<Page> PagePtr;
107
108 virtual ~PagedByteArray();
109
110 // Returns the length of the array in bytes. The value returned must remain
111 // the same on every call for the entire lifetime of the object.
112 virtual size_t length() const = 0;
113
114 // Returns the length of each page in bytes. (The last page may be shorter
115 // than pageSize() if length() is not a multiple of pageSize() -- see also
116 // the class-wide comment above.) The value returned must remain the same on
117 // every call for the entire lifetime of the object.
118 virtual size_t pageSize() const = 0;
119
120 // Returns a pointer to a memory buffer containing the data for the page
121 // with index "page_index".
122 //
123 // *begin is set to point to the first byte of the page; *end is set to point
124 // one byte beyond the last byte in the page. This implies that:
125 // - (*end - *begin) == pageSize() for every page except the last page
126 // - (*end - *begin) == length() - pageSize() * ((length() - 1) / pageSize())
127 // for the last page
128 //
129 // *page will be set to a SharedPtr that the caller will hold on to until
130 // it no longer needs to access the memory buffer. The memory buffer will
131 // remain valid until the SharedPtr is released or the PagedByteArray object
132 // is destroyed. An implementation may choose to return a null SharedPtr;
133 // this indicates that the memory buffer will remain valid until the
134 // PagedByteArray object is destroyed, even if the caller does not hold on to
135 // the SharedPtr. (This is intended as an optimization that some
136 // implementations may choose to take advantage of, as a null SharedPtr is
137 // cheaper to copy.)
138 virtual void getPage(size_t page_index, const unsigned char **begin,
139 const unsigned char **end, PagePtr *page) const = 0;
140};
141
142typedef std::shared_ptr<PagedByteArray> PagedByteArrayPtr;
143
144// Smart pointer that has the same semantics as a "const unsigned char *" (plus
145// some convenience functions) but provides range checking and the ability to
146// access arrays that are not contiguous in memory or do not reside entirely in
147// memory (through the PagedByteArray interface).
148//
149// In the following, we abbreviate RangeCheckedBytePtr as RCBP.
150//
151// The intent of this class is to allow easy security hardening of code that
152// parses binary data structures using raw byte pointers. To do this, only the
153// declarations of the pointers need to be changed; the code that uses the
154// pointers can remain unchanged.
155//
156// If an illegal operation occurs on a pointer, an error flag is set, and all
157// read operations from this point on return 0. This means that error checking
158// need not be done after every access; it is sufficient to check the error flag
159// (using errorOccurred()) once before the RCBP is destroyed. Again, this allows
160// the majority of the parsing code to remain unchanged. (Note caveats below
161// that apply if a copy of the pointer is created.)
162//
163// Legal operations are exactly the ones that would be legal on a raw C++
164// pointer. Read accesses are legal if they fall within the underlying array. A
165// RCBP may point to any element in the underlying array or one element beyond
166// the end of the array.
167//
168// For brevity, the documentation for individual member functions does not state
169// explicitly that the error flag will be set on out-of-range operations.
170//
171// Note:
172//
173// - Just as for raw pointers, it is legal for a pointer to point one element
174// beyond the end of the array, but it is illegal to use operator*() on such a
175// pointer.
176//
177// - If a copy of an RCBP is created, then performing illegal operations on the
178// copy affects the error flag of the copy, but not of the original pointer.
179// Note that using operator+ and operator- also creates a copy of the pointer.
180// For example:
181//
182// // Assume we have an RCBP called "p" and a size_t variable called
183// // "offset".
184// RangeCheckedBytePtr sub_data_structure = p + offset;
185//
186// If "offset" is large enough to cause an out-of-range access, then
187// sub_data_structure.errorOccurred() will be true, but p.errorOccurred() will
188// still be false. The error flag for sub_data_structure therefore needs to be
189// checked before it is destroyed.
190class RangeCheckedBytePtr {
191 private:
192 // This class maintains the following class invariants:
193 // - page_data_ always points to a buffer of at least current_page_len_
194 // bytes.
195 //
196 // - The current position lies within the sub-array, i.e.
197 // sub_array_begin_ <= current_pos_ <= sub_array_end_
198 //
199 // - The sub-array is entirely contained within the array, i.e.
200 // 0 <= sub_array_begin <= sub_array_end <= array_->length()
201 //
202 // - If the current page is non-empty, it lies completely within the
203 // sub-array, i.e.
204 // if _current_page_len_ >= 0, then
205 // sub_array_begin_ <= page_begin_offset_
206 // and
207 // page_begin_offset_ + current_page_len_ <= sub_array_end_
208 // (See also restrictPageToSubArray().)
209 // (If _current_page_len_ == 0, then page_begin_offset_ may lie outside
210 // the sub-array; this condition is harmless. Additional logic would be
211 // required to make page_begin_offset_ lie within the sub-array in this
212 // case, and it would serve no purpose other than to make the invariant
213 // slightly simpler.)
214 //
215 // Note that it is _not_ a class invariant that current_pos_ needs to lie
216 // within the current page. Making this an invariant would have two
217 // undesirable consequences:
218 // a) When operator[] is called with an index that lies beyond the end of
219 // the current page, it would need to temporarily load the page that
220 // contains this index, but it wouldn't be able to "retain" the page
221 // (i.e. make it the current page) because that would violate the
222 // proposed invariant. This would lead to inefficient behavior in the
223 // case where code accesses a large range of indices beyond the end of
224 // the page because a page would need to be loaded temporarily on each
225 // access.
226 // b) It would require more code: loadPageForOffset() would need to be
227 // called anywhere that current_pos_ changes (whereas, with the present
228 // approach, loadPageForOffset() is only called in operator[]).
229
230 // PagedByteArray that is accessed by this pointer.
231 PagedByteArrayPtr array_;
232
233 // Pointer to the current page.
234 mutable PagedByteArray::PagePtr page_;
235
236 // Pointer to the current page's data buffer.
237 mutable const unsigned char *page_data_;
238
239 // All of the following offsets are defined relative to the beginning of
240 // the array defined by the PagedByteArray array_.
241
242 // Array offset that the pointer points to.
243 size_t current_pos_;
244
245 // Start offset of the current sub-array.
246 size_t sub_array_begin_;
247
248 // End offset of the current sub-array.
249 size_t sub_array_end_;
250
251 // Array offset corresponding to the "page_data_" pointer.
252 mutable size_t page_begin_offset_;
253
254 // Length of the current page.
255 mutable size_t current_page_len_;
256
257 // Error flag. This is mutable because methods that don't affect the value
258 // of the pointer itself (such as operator[]) nevertheless need to be able to
259 // signal error conditions.
260 mutable MemoryStatus error_flag_;
261
262 RangeCheckedBytePtr();
263
264 public:
265 // Creates a pointer that points to the first element of 'array', which has a
266 // length of 'len'. The caller must ensure that the array remains valid until
267 // this pointer and any pointers created from it have been destroyed.
268 // Note: 'len' may be zero, but 'array' must in this case still be a valid,
269 // non-null pointer.
270 explicit RangeCheckedBytePtr(const unsigned char *array, const size_t len);
271
272 // Creates a pointer that points to the first element of the given
273 // PagedByteArray. The caller must ensure that this PagedByteArray remains
274 // valid until this pointer and any pointers created from it have been
275 // destroyed.
276 explicit RangeCheckedBytePtr(PagedByteArray *array);
277
278 // Creates an invalid RangeCheckedBytePtr. Calling errorOccurred() on the
279 // result of invalidPointer() always returns true.
280 // Do not check a RangeCheckedBytePtr for validity by comparing against
281 // invalidPointer(); use errorOccurred() instead.
282 static RangeCheckedBytePtr invalidPointer();
283
284 // Returns a RangeCheckedBytePtr that points to a sub-array of this pointer's
285 // underlying array. The sub-array starts at position 'pos' relative to this
286 // pointer and is 'length' bytes long. The sub-array must lie within this
287 // pointer's array, i.e. pos + length <= remainingLength() must hold. If this
288 // condition is violated, an invalid pointer is returned.
289 RangeCheckedBytePtr pointerToSubArray(size_t pos, size_t length) const;
290
291 // Returns the number of bytes remaining in the array from this pointer's
292 // present position.
293 inline size_t remainingLength() const;
294
295 // Returns the offset (or index) in the underlying array that this pointer
296 // points to. If this pointer was created using pointerToSubArray(), the
297 // offset is relative to the beginning of the sub-array (and not relative to
298 // the beginning of the original array).
299 size_t offsetInArray() const;
300
301 // Returns whether an out-of-bounds error has ever occurred on this pointer in
302 // the past. An error occurs if a caller attempts to read from a position
303 // outside the bounds of the array or to move the pointer outside the bounds
304 // of the array.
305 //
306 // The error flag is never reset. Once an error has occurred,
307 // all subsequent attempts to read from the pointer (even within the bounds of
308 // the array) return 0.
309 //
310 // Note that it is permissible for a pointer to point one element past the end
311 // of the array, but it is not permissible to read from this position. This is
312 // equivalent to the semantics of raw C++ pointers.
313 inline bool errorOccurred() const;
314
315 // Returns the substring of length 'length' located at position 'pos' relative
316 // to this pointer.
317 std::string substr(size_t pos, size_t length) const;
318
319 // Returns 'length' number of bytes from the array starting at position 'pos'
320 // relative to this pointer.
321 std::vector<unsigned char> extractBytes(size_t pos, size_t length) const;
322
323 // Equivalent to calling convert(0, output).
324 template <class T>
325 bool convert(T *output) const {
326 union {
327 T t;
328 unsigned char ch[sizeof(T)];
329 } buffer;
330 for (size_t i = 0; i < sizeof(T); i++) {
331 buffer.ch[i] = (*this)[i];
332 }
333 if (!errorOccurred()) {
334 *output = buffer.t;
335 }
336 return !errorOccurred();
337 }
338
339 // Reinterprets this pointer as a pointer to an array of T, then returns the
340 // element at position 'index' in this array of T. (Note that this position
341 // corresponds to position index * sizeof(T) in the underlying byte array.)
342 //
343 // Returns true if successful; false if an out-of-range error occurred or if
344 // the error flag was already set on the pointer when calling convert().
345 //
346 // The conversion from a sequence of sizeof(T) bytes to a T is performed in an
347 // implementation-defined fashion. This conversion is equivalent to the one
348 // obtained using the following union by filling the array 'ch' and then
349 // reading the member 't':
350 //
351 // union {
352 // T t;
353 // unsigned char ch[sizeof(T)];
354 // };
355 //
356 // Callers should note that, among other things, the conversion is not
357 // endian-agnostic with respect to the endianness of T.
358 template <class T>
359 bool convert(size_t index, T *output) const {
360 RangeCheckedBytePtr p = (*this) + index * sizeof(T);
361 bool valid = p.convert(output);
362 if (!valid) {
363 error_flag_ = p.error_flag_;
364 }
365 return valid;
366 }
367
368 // Operators. Unless otherwise noted, these operators have the same semantics
369 // as the same operators on an unsigned char pointer.
370
371 // If an out-of-range access is attempted, returns 0 (and sets the error
372 // flag).
373 inline unsigned char operator[](size_t i) const;
374
375 inline unsigned char operator*() const;
376
377 inline RangeCheckedBytePtr &operator++();
378
379 inline RangeCheckedBytePtr operator++(int);
380
381 inline RangeCheckedBytePtr &operator--();
382
383 inline RangeCheckedBytePtr operator--(int);
384
385 inline RangeCheckedBytePtr &operator+=(size_t x);
386
387 inline RangeCheckedBytePtr &operator-=(size_t x);
388
389 inline friend RangeCheckedBytePtr operator+(const RangeCheckedBytePtr &p,
390 size_t x);
391
392 inline friend RangeCheckedBytePtr operator-(const RangeCheckedBytePtr &p,
393 size_t x);
394
395 // Tests whether x and y point at the same position in the underlying array.
396 // Two pointers that point at the same position but have different
397 // sub-arrays still compare equal. It is not legal to compare two pointers
398 // that point into different paged byte arrays.
399 friend bool operator==(const RangeCheckedBytePtr &x,
400 const RangeCheckedBytePtr &y);
401
402 // Returns !(x == y).
403 friend bool operator!=(const RangeCheckedBytePtr &x,
404 const RangeCheckedBytePtr &y);
405
406 private:
407 void loadPageForOffset(size_t offset) const;
408 void restrictPageToSubArray() const;
409};
410
411// Returns the result of calling std::memcmp() on the sequences of 'num' bytes
412// pointed to by 'x' and 'y'. The result is undefined if either
413// x.remainingLength() or y.remainingLength() is less than 'num'.
414int memcmp(const RangeCheckedBytePtr &x, const RangeCheckedBytePtr &y,
415 size_t num);
416
417// Returns the result of calling std::memcmp() (note: _not_ strcmp()) on the
418// y.length() number of bytes pointed to by 'x' and the string 'y'. The result
419// is undefined if x.remainingLength() is less than y.length().
420int strcmp(const RangeCheckedBytePtr &x, const std::string &y);
421
422// Returns the length of the zero-terminated string starting at 'src' (not
423// including the '\0' terminator). If no '\0' occurs before the end of the
424// array, the result is undefined.
425size_t strlen(const RangeCheckedBytePtr &src);
426
427// Integer decoding functions.
428//
429// These functions read signed (Get16s, Get32s) or unsigned (Get16u, Get32u)
430// integers from 'input'. The integer read from the input can be specified to be
431// either big-endian (big_endian == true) or little-endian
432// (little_endian == false). Signed integers are read in two's-complement
433// representation. The integer read in the specified format is then converted to
434// the implementation's native integer representation and returned. In other
435// words, the semantics of these functions are independent of the
436// implementation's endianness and signed integer representation.
437//
438// If an out-of-range error occurs, these functions do _not_ set the error flag
439// on 'input'. Instead, they set 'status' to RANGE_CHECKED_BYTE_ERROR and return
440// 0.
441//
442// Note:
443// - If an error occurs and 'status' is already set to an error value (i.e. a
444// value different from RANGE_CHECKED_BYTE_SUCCESS), the value of 'status' is
445// left unchanged.
446// - If the operation is successful, 'status' is left unchanged (i.e. it is not
447// actively set to RANGE_CHECKED_BYTE_SUCCESS).
448//
449// Together, these two properties mean that these functions can be used to read
450// a number of integers in succession with only a single error check, like this:
451//
452// MemoryStatus status = RANGE_CHECKED_BYTE_SUCCESS;
453// int16 val1 = Get16s(input, false, &status);
454// int32 val2 = Get32s(input + 2, false, &status);
455// uint32 val3 = Get32u(input + 6, false, &status);
456// if (status != RANGE_CHECKED_BYTE_SUCCESS) {
457// // error handling
458// }
459int16 Get16s(const RangeCheckedBytePtr &input, const bool big_endian,
460 MemoryStatus *status);
461uint16 Get16u(const RangeCheckedBytePtr &input, const bool big_endian,
462 MemoryStatus *status);
463int32 Get32s(const RangeCheckedBytePtr &input, const bool big_endian,
464 MemoryStatus *status);
465uint32 Get32u(const RangeCheckedBytePtr &input, const bool big_endian,
466 MemoryStatus *status);
467
468size_t RangeCheckedBytePtr::remainingLength() const {
469 if (!errorOccurred()) {
470 // current_pos_ <= sub_array_end_ is a class invariant, but protect
471 // against violations of this invariant.
472 if (current_pos_ <= sub_array_end_) {
473 return sub_array_end_ - current_pos_;
474 } else {
475 assert(false);
476 return 0;
477 }
478 } else {
479 return 0;
480 }
481}
482
483bool RangeCheckedBytePtr::errorOccurred() const {
484 return error_flag_ != RANGE_CHECKED_BYTE_SUCCESS;
485}
486
487unsigned char RangeCheckedBytePtr::operator[](size_t i) const {
488 // Check that pointer doesn't have an error flag set.
489 if (!errorOccurred()) {
490 // Offset in array to read from.
491 const size_t read_offset = current_pos_ + i;
492
493 // Check for the common case first: The byte we want to read lies in the
494 // current page. For performance reasons, we don't check for the case
495 // "read_offset < page_begin_offset_" explicitly; if it occurs, it will
496 // lead to wraparound (which is well-defined for unsigned quantities), and
497 // this will cause the test "pos_in_page < current_page_len_" to fail.
498 size_t pos_in_page = read_offset - page_begin_offset_;
499 if (pos_in_page < current_page_len_) {
500 return page_data_[pos_in_page];
501 }
502
503 // Check that the offset we're trying to read lies within the sub-array
504 // we're allowed to access.
505 if (read_offset >= sub_array_begin_ && read_offset < sub_array_end_) {
506 // Read the page that contains the offset "read_offset".
507 loadPageForOffset(read_offset);
508
509 // Compute the position within the new page from which we need to read.
510 pos_in_page = read_offset - page_begin_offset_;
511
512 // After the call to loadPageForOffset(), read_offset must lie within
513 // the current page, and therefore pos_in_page must be less than the
514 // length of the page. We nevertheless check for this to protect against
515 // potential bugs in loadPageForOffset().
516 assert(pos_in_page < current_page_len_);
517 if (pos_in_page < current_page_len_) {
518 return page_data_[pos_in_page];
519 }
520 }
521 }
522
523// All error cases fall through to here.
524#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE
525 assert(false);
526#endif
527 error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW;
528 // return 0, which represents the invalid value
529 return static_cast<unsigned char>(0);
530}
531
532unsigned char RangeCheckedBytePtr::operator*() const { return (*this)[0]; }
533
534RangeCheckedBytePtr &RangeCheckedBytePtr::operator++() {
535 if (current_pos_ < sub_array_end_) {
536 current_pos_++;
537 } else {
538#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE
539 assert(false);
540#endif
541 error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW;
542 }
543 return *this;
544}
545
546RangeCheckedBytePtr RangeCheckedBytePtr::operator++(int) {
547 RangeCheckedBytePtr result(*this);
548 ++(*this);
549 return result;
550}
551
552RangeCheckedBytePtr &RangeCheckedBytePtr::operator--() {
553 if (current_pos_ > sub_array_begin_) {
554 current_pos_--;
555 } else {
556#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE
557 assert(false);
558#endif
559 error_flag_ = RANGE_CHECKED_BYTE_ERROR_UNDERFLOW;
560 }
561 return *this;
562}
563
564RangeCheckedBytePtr RangeCheckedBytePtr::operator--(int) {
565 RangeCheckedBytePtr result(*this);
566 --(*this);
567 return result;
568}
569
570RangeCheckedBytePtr &RangeCheckedBytePtr::operator+=(size_t x) {
571 if (remainingLength() >= x) {
572 current_pos_ += x;
573 } else {
574#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE
575 assert(false);
576#endif
577 error_flag_ = RANGE_CHECKED_BYTE_ERROR_OVERFLOW;
578 }
579 return *this;
580}
581
582RangeCheckedBytePtr &RangeCheckedBytePtr::operator-=(size_t x) {
583 if (x <= current_pos_ - sub_array_begin_) {
584 current_pos_ -= x;
585 } else {
586#ifdef BREAK_IF_DEBUGGING_AND_OUT_OF_RANGE
587 assert(false);
588#endif
589 error_flag_ = RANGE_CHECKED_BYTE_ERROR_UNDERFLOW;
590 }
591 return *this;
592}
593
594RangeCheckedBytePtr operator+(const RangeCheckedBytePtr &p, size_t x) {
595 RangeCheckedBytePtr result(p);
596 result += x;
597 return result;
598}
599
600RangeCheckedBytePtr operator-(const RangeCheckedBytePtr &p, size_t x) {
601 RangeCheckedBytePtr result(p);
602 result -= x;
603 return result;
604}
605
606} // namespace binary_parse
607} // namespace piex
608
609#endif // PIEX_BINARY_PARSE_RANGE_CHECKED_BYTE_PTR_H_
610