1 | // Copyright 2001 and onwards Google Inc. |
2 | // |
3 | // Raw support for varint encoding. Higher level interfaces are |
4 | // provided by Encoder/Decoder/IOBuffer. Clients should typically use |
5 | // those interfaces, unless speed is paramount. |
6 | // |
7 | // If decoding speed is very important, consider using PrefixVarint instead. |
8 | // It has the same compression ratio, but generally faster decoding. |
9 | // |
10 | // Provided routines: |
11 | // vi_parse32_unchecked |
12 | // vi_parse64_unchecked |
13 | // vi_encode32_unchecked |
14 | // vi_encode64_unchecked |
15 | |
16 | #ifndef _VARINT_H |
17 | #define _VARINT_H |
18 | |
19 | #include <string> |
20 | using std::string; |
21 | |
22 | #include "base/basictypes.h" |
23 | |
24 | // Just a namespace, not a real class |
25 | class Varint { |
26 | public: |
27 | // Maximum lengths of varint encoding of uint32 and uint64 |
28 | static const int kMax32 = 5; |
29 | static const int kMax64 = 10; |
30 | |
31 | // REQUIRES "ptr" points to a buffer of length at least kMaxXX |
32 | // EFFECTS Scan next varint from "ptr" and store in OUTPUT. |
33 | // Returns pointer just past last read byte. Returns |
34 | // NULL if a valid varint value was not found. |
35 | static const char* Parse32(const char* ptr, uint32* OUTPUT); |
36 | static const char* Parse64(const char* ptr, uint64* OUTPUT); |
37 | |
38 | // A fully inlined version of Parse32: useful in the most time critical |
39 | // routines, but its code size is large |
40 | static const char* Parse32Inline(const char* ptr, uint32* OUTPUT); |
41 | |
42 | // REQUIRES "ptr" points just past the last byte of a varint-encoded value. |
43 | // REQUIRES A second varint must be encoded just before the one we parse, |
44 | // OR "base" must point to the first byte of the one we parse. |
45 | // REQUIRES Bytes [base, ptr-1] are readable |
46 | // |
47 | // EFFECTS Scan backwards from "ptr" and store in OUTPUT. Stop at the last |
48 | // byte of the previous varint, OR at "base", whichever one comes |
49 | // first. Returns pointer to the first byte of the decoded varint |
50 | // NULL if a valid varint value was not found. |
51 | static const char* Parse32Backward(const char* ptr, const char* base, |
52 | uint32* OUTPUT); |
53 | static const char* Parse64Backward(const char* ptr, const char* base, |
54 | uint64* OUTPUT); |
55 | |
56 | // Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1]. |
57 | // Never reads a character at or beyond limit. If a valid/terminated varint32 |
58 | // was found in the range, stores it in *OUTPUT and returns a pointer just |
59 | // past the last byte of the varint32. Else returns NULL. On success, |
60 | // "result <= limit". |
61 | static const char* Parse32WithLimit(const char* ptr, const char* limit, |
62 | uint32* OUTPUT); |
63 | static const char* Parse64WithLimit(const char* ptr, const char* limit, |
64 | uint64* OUTPUT); |
65 | |
66 | // REQUIRES "ptr" points to the first byte of a varint-encoded value. |
67 | // EFFECTS Scans until the end of the varint and returns a pointer just |
68 | // past the last byte. Returns NULL if "ptr" does not point to |
69 | // a valid varint value. |
70 | static const char* Skip32(const char* ptr); |
71 | static const char* Skip64(const char* ptr); |
72 | |
73 | // REQUIRES "ptr" points just past the last byte of a varint-encoded value. |
74 | // REQUIRES A second varint must be encoded just before the one we parse, |
75 | // OR "base" must point to the first byte of the one we parse. |
76 | // REQUIRES Bytes [base, ptr-1] are readable |
77 | // |
78 | // EFFECTS Scan backwards from "ptr" and stop at the last byte of the |
79 | // previous varint, OR at "base", whichever one comes first. |
80 | // Returns pointer to the first byte of the skipped varint or |
81 | // NULL if a valid varint value was not found. |
82 | static const char* Skip32Backward(const char* ptr, const char* base); |
83 | static const char* Skip64Backward(const char* ptr, const char* base); |
84 | |
85 | // REQUIRES "ptr" points to a buffer of length sufficient to hold "v". |
86 | // EFFECTS Encodes "v" into "ptr" and returns a pointer to the |
87 | // byte just past the last encoded byte. |
88 | static char* Encode32(char* ptr, uint32 v); |
89 | static char* Encode64(char* ptr, uint64 v); |
90 | |
91 | // A fully inlined version of Encode32: useful in the most time critical |
92 | // routines, but its code size is large |
93 | static char* Encode32Inline(char* ptr, uint32 v); |
94 | |
95 | // EFFECTS Returns the encoding length of the specified value. |
96 | static int Length32(uint32 v); |
97 | static int Length64(uint64 v); |
98 | |
99 | static int Length32NonInline(uint32 v); |
100 | |
101 | // EFFECTS Appends the varint representation of "value" to "*s". |
102 | static void Append32(string* s, uint32 value); |
103 | static void Append64(string* s, uint64 value); |
104 | |
105 | // EFFECTS Encodes a pair of values to "*s". The encoding |
106 | // is done by weaving together 4 bit groups of |
107 | // each number into a single 64 bit value, and then |
108 | // encoding this value as a Varint64 value. This means |
109 | // that if both a and b are small, both values can be |
110 | // encoded in a single byte. |
111 | static void EncodeTwo32Values(string* s, uint32 a, uint32 b); |
112 | static const char* DecodeTwo32Values(const char* ptr, uint32* a, uint32* b); |
113 | |
114 | // Decode and sum up a sequence of deltas until the sum >= goal. |
115 | // It is significantly faster than calling ParseXXInline in a loop. |
116 | // NOTE(user): The code does NO error checking, it assumes all the |
117 | // deltas are valid and the sum of deltas will never exceed kint64max. The |
118 | // code works for both 32bits and 64bits varint, and on 64 bits machines, |
119 | // the 64 bits version is almost always faster. Thus we only have a 64 bits |
120 | // interface here. The interface is slightly different from the other |
121 | // functions in that it requires *signed* integers. |
122 | // REQUIRES "ptr" points to the first byte of a varint-encoded delta. |
123 | // The sum of deltas >= goal (the code does NO boundary check). |
124 | // goal is positive and fit into a signed int64. |
125 | // EFFECTS Returns a pointer just past last read byte. |
126 | // "out" stores the actual sum. |
127 | static const char* FastDecodeDeltas(const char* ptr, int64 goal, int64* out); |
128 | |
129 | private: |
130 | static const char* Parse32FallbackInline(const char* p, uint32* val); |
131 | static const char* Parse32Fallback(const char* p, uint32* val); |
132 | static const char* Parse64Fallback(const char* p, uint64* val); |
133 | |
134 | static char* Encode32Fallback(char* ptr, uint32 v); |
135 | |
136 | static const char* DecodeTwo32ValuesSlow(const char* p, uint32* a, uint32* b); |
137 | static const char* Parse32BackwardSlow(const char* ptr, const char* base, |
138 | uint32* OUTPUT); |
139 | static const char* Parse64BackwardSlow(const char* ptr, const char* base, |
140 | uint64* OUTPUT); |
141 | static const char* Skip32BackwardSlow(const char* ptr, const char* base); |
142 | static const char* Skip64BackwardSlow(const char* ptr, const char* base); |
143 | |
144 | static void Append32Slow(string* s, uint32 value); |
145 | static void Append64Slow(string* s, uint64 value); |
146 | |
147 | // Mapping from rightmost bit set to the number of bytes required |
148 | static const char length32_bytes_required[33]; |
149 | }; |
150 | |
151 | /***** Implementation details; clients should ignore *****/ |
152 | |
153 | inline const char* Varint::Parse32FallbackInline(const char* p, |
154 | uint32* OUTPUT) { |
155 | // Fast path |
156 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
157 | uint32 byte, result; |
158 | byte = *(ptr++); result = byte & 127; |
159 | assert(byte >= 128); // Already checked in inlined prelude |
160 | byte = *(ptr++); result |= (byte & 127) << 7; if (byte < 128) goto done; |
161 | byte = *(ptr++); result |= (byte & 127) << 14; if (byte < 128) goto done; |
162 | byte = *(ptr++); result |= (byte & 127) << 21; if (byte < 128) goto done; |
163 | byte = *(ptr++); result |= (byte & 127) << 28; if (byte < 128) goto done; |
164 | return NULL; // Value is too long to be a varint32 |
165 | done: |
166 | *OUTPUT = result; |
167 | return reinterpret_cast<const char*>(ptr); |
168 | } |
169 | |
170 | inline const char* Varint::Parse32(const char* p, uint32* OUTPUT) { |
171 | // Fast path for inlining |
172 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
173 | uint32 byte = *ptr; |
174 | if (byte < 128) { |
175 | *OUTPUT = byte; |
176 | return reinterpret_cast<const char*>(ptr) + 1; |
177 | } else { |
178 | return Parse32Fallback(p, OUTPUT); |
179 | } |
180 | } |
181 | |
182 | inline const char* Varint::Parse32Inline(const char* p, uint32* OUTPUT) { |
183 | // Fast path for inlining |
184 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
185 | uint32 byte = *ptr; |
186 | if (byte < 128) { |
187 | *OUTPUT = byte; |
188 | return reinterpret_cast<const char*>(ptr) + 1; |
189 | } else { |
190 | return Parse32FallbackInline(p, OUTPUT); |
191 | } |
192 | } |
193 | |
194 | inline const char* Varint::Skip32(const char* p) { |
195 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
196 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
197 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
198 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
199 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
200 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
201 | return NULL; // value is too long to be a varint32 |
202 | } |
203 | |
204 | inline const char* Varint::Parse32Backward(const char* p, const char* base, |
205 | uint32* OUTPUT) { |
206 | if (p > base + kMax32) { |
207 | // Fast path |
208 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
209 | uint32 byte, result; |
210 | byte = *(--ptr); if (byte > 127) return NULL; |
211 | result = byte; |
212 | byte = *(--ptr); if (byte < 128) goto done; |
213 | result <<= 7; result |= (byte & 127); |
214 | byte = *(--ptr); if (byte < 128) goto done; |
215 | result <<= 7; result |= (byte & 127); |
216 | byte = *(--ptr); if (byte < 128) goto done; |
217 | result <<= 7; result |= (byte & 127); |
218 | byte = *(--ptr); if (byte < 128) goto done; |
219 | result <<= 7; result |= (byte & 127); |
220 | byte = *(--ptr); if (byte < 128) goto done; |
221 | return NULL; // Value is too long to be a varint32 |
222 | done: |
223 | *OUTPUT = result; |
224 | return reinterpret_cast<const char*>(ptr+1); |
225 | } else { |
226 | return Parse32BackwardSlow(p, base, OUTPUT); |
227 | } |
228 | } |
229 | |
230 | inline const char* Varint::Skip32Backward(const char* p, const char* base) { |
231 | if (p > base + kMax32) { |
232 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
233 | if (*(--ptr) > 127) return NULL; |
234 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
235 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
236 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
237 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
238 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
239 | return NULL; // value is too long to be a varint32 |
240 | } else { |
241 | return Skip32BackwardSlow(p, base); |
242 | } |
243 | } |
244 | |
245 | inline const char* Varint::Parse32WithLimit(const char* p, |
246 | const char* l, |
247 | uint32* OUTPUT) { |
248 | if (p + kMax32 <= l) { |
249 | return Parse32(p, OUTPUT); |
250 | } else { |
251 | // Slow version with bounds checks |
252 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
253 | const unsigned char* limit = reinterpret_cast<const unsigned char*>(l); |
254 | uint32 b, result; |
255 | if (ptr >= limit) return NULL; |
256 | b = *(ptr++); result = b & 127; if (b < 128) goto done; |
257 | if (ptr >= limit) return NULL; |
258 | b = *(ptr++); result |= (b & 127) << 7; if (b < 128) goto done; |
259 | if (ptr >= limit) return NULL; |
260 | b = *(ptr++); result |= (b & 127) << 14; if (b < 128) goto done; |
261 | if (ptr >= limit) return NULL; |
262 | b = *(ptr++); result |= (b & 127) << 21; if (b < 128) goto done; |
263 | if (ptr >= limit) return NULL; |
264 | b = *(ptr++); result |= (b & 127) << 28; if (b < 16) goto done; |
265 | return NULL; // Value is too long to be a varint32 |
266 | done: |
267 | *OUTPUT = result; |
268 | return reinterpret_cast<const char*>(ptr); |
269 | } |
270 | |
271 | } |
272 | |
273 | inline const char* Varint::Parse64(const char* p, uint64* OUTPUT) { |
274 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
275 | uint32 byte = *ptr; |
276 | if (byte < 128) { |
277 | *OUTPUT = byte; |
278 | return reinterpret_cast<const char*>(ptr) + 1; |
279 | } else { |
280 | return Parse64Fallback(p, OUTPUT); |
281 | } |
282 | } |
283 | |
284 | inline const char* Varint::Skip64(const char* p) { |
285 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
286 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
287 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
288 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
289 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
290 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
291 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
292 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
293 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
294 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
295 | if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr); |
296 | return NULL; // value is too long to be a varint64 |
297 | } |
298 | |
299 | inline const char* Varint::Parse64Backward(const char* p, const char* b, |
300 | uint64* OUTPUT) { |
301 | if (p > b + kMax64) { |
302 | // Fast path |
303 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
304 | uint32 byte; |
305 | uint64 res; |
306 | |
307 | byte = *(--ptr); if (byte > 127) return NULL; |
308 | |
309 | res = byte; |
310 | byte = *(--ptr); if (byte < 128) goto done; |
311 | res <<= 7; res |= (byte & 127); |
312 | byte = *(--ptr); if (byte < 128) goto done; |
313 | res <<= 7; res |= (byte & 127); |
314 | byte = *(--ptr); if (byte < 128) goto done; |
315 | res <<= 7; res |= (byte & 127); |
316 | byte = *(--ptr); if (byte < 128) goto done; |
317 | res <<= 7; res |= (byte & 127); |
318 | byte = *(--ptr); if (byte < 128) goto done; |
319 | res <<= 7; res |= (byte & 127); |
320 | byte = *(--ptr); if (byte < 128) goto done; |
321 | res <<= 7; res |= (byte & 127); |
322 | byte = *(--ptr); if (byte < 128) goto done; |
323 | res <<= 7; res |= (byte & 127); |
324 | byte = *(--ptr); if (byte < 128) goto done; |
325 | res <<= 7; res |= (byte & 127); |
326 | byte = *(--ptr); if (byte < 128) goto done; |
327 | res <<= 7; res |= (byte & 127); |
328 | byte = *(--ptr); if (byte < 128) goto done; |
329 | |
330 | return NULL; // Value is too long to be a varint64 |
331 | |
332 | done: |
333 | *OUTPUT = res; |
334 | return reinterpret_cast<const char*>(ptr + 1); |
335 | } else { |
336 | return Parse64BackwardSlow(p, b, OUTPUT); |
337 | } |
338 | } |
339 | |
340 | inline const char* Varint::Skip64Backward(const char* p, const char* b) { |
341 | if (p > b + kMax64) { |
342 | // Fast path |
343 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
344 | if (*(--ptr) > 127) return NULL; |
345 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
346 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
347 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
348 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
349 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
350 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
351 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
352 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
353 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
354 | if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1); |
355 | return NULL; // value is too long to be a varint64 |
356 | } else { |
357 | return Skip64BackwardSlow(p, b); |
358 | } |
359 | } |
360 | |
361 | inline const char* Varint::DecodeTwo32Values(const char* p, |
362 | uint32* a, uint32* b) { |
363 | const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p); |
364 | if (*ptr < 128) { |
365 | // Special case for small values |
366 | *a = (*ptr & 0xf); |
367 | *b = *ptr >> 4; |
368 | return reinterpret_cast<const char*>(ptr) + 1; |
369 | } else { |
370 | return DecodeTwo32ValuesSlow(p, a, b); |
371 | } |
372 | } |
373 | |
374 | #if (defined __i386__ || defined __x86_64__) && defined __GNUC__ |
375 | inline int Varint::Length32(uint32 v) { |
376 | // Find the rightmost bit set, and index into a small table |
377 | |
378 | // "ro" for the input spec means the input can come from either a |
379 | // register ("r") or offsetable memory ("o"). |
380 | // |
381 | // If "n == 0", the "bsr" instruction sets the "Z" flag, so we |
382 | // conditionally move "-1" into the result. |
383 | // |
384 | // Note: the cmovz was introduced on PIII's, and may not work on |
385 | // older machines. |
386 | int bits; |
387 | const int neg1 = -1; |
388 | __asm__("bsr %1, %0\n\t" |
389 | "cmovz %2, %0" |
390 | : "=&r" (bits) // Output spec, early clobber |
391 | : "ro" (v), "r" (neg1) // Input spec |
392 | : "cc" // Clobbers condition-codes |
393 | ); |
394 | return Varint::length32_bytes_required[bits+1]; |
395 | } |
396 | |
397 | #else |
398 | inline int Varint::Length32(uint32 v) { |
399 | /* |
400 | The following version is about 1.5X the code size, but is faster than |
401 | the loop below. |
402 | |
403 | if (v < (1u << 14)) { |
404 | if (v < (1u << 7)) { |
405 | return 1; |
406 | } else { |
407 | return 2; |
408 | } |
409 | } else { |
410 | if (v < (1u << 28)) { |
411 | if (v < (1u << 21)) { |
412 | return 3; |
413 | } else { |
414 | return 4; |
415 | } |
416 | } else { |
417 | return 5; |
418 | } |
419 | } |
420 | */ |
421 | |
422 | // Each byte of output stores 7 bits of "v" until "v" becomes zero |
423 | int nbytes = 0; |
424 | do { |
425 | nbytes++; |
426 | v >>= 7; |
427 | } while (v != 0); |
428 | return nbytes; |
429 | } |
430 | #endif |
431 | |
432 | inline void Varint::Append32(string* s, uint32 value) { |
433 | // Inline the fast-path for single-character output, but fall back to the .cc |
434 | // file for the full version. The size<capacity check is so the compiler can |
435 | // optimize out the string resize code. |
436 | if (value < 128 && s->size() < s->capacity()) { |
437 | s->push_back((unsigned char)value); |
438 | } else { |
439 | Append32Slow(s, value); |
440 | } |
441 | } |
442 | |
443 | inline void Varint::Append64(string* s, uint64 value) { |
444 | // Inline the fast-path for single-character output, but fall back to the .cc |
445 | // file for the full version. The size<capacity check is so the compiler can |
446 | // optimize out the string resize code. |
447 | if (value < 128 && s->size() < s->capacity()) { |
448 | s->push_back((unsigned char)value); |
449 | } else { |
450 | Append64Slow(s, value); |
451 | } |
452 | } |
453 | |
454 | inline char* Varint::Encode32Inline(char* sptr, uint32 v) { |
455 | // Operate on characters as unsigneds |
456 | unsigned char* ptr = reinterpret_cast<unsigned char*>(sptr); |
457 | static const int B = 128; |
458 | if (v < (1<<7)) { |
459 | *(ptr++) = v; |
460 | } else if (v < (1<<14)) { |
461 | *(ptr++) = v | B; |
462 | *(ptr++) = v>>7; |
463 | } else if (v < (1<<21)) { |
464 | *(ptr++) = v | B; |
465 | *(ptr++) = (v>>7) | B; |
466 | *(ptr++) = v>>14; |
467 | } else if (v < (1<<28)) { |
468 | *(ptr++) = v | B; |
469 | *(ptr++) = (v>>7) | B; |
470 | *(ptr++) = (v>>14) | B; |
471 | *(ptr++) = v>>21; |
472 | } else { |
473 | *(ptr++) = v | B; |
474 | *(ptr++) = (v>>7) | B; |
475 | *(ptr++) = (v>>14) | B; |
476 | *(ptr++) = (v>>21) | B; |
477 | *(ptr++) = v>>28; |
478 | } |
479 | return reinterpret_cast<char*>(ptr); |
480 | } |
481 | |
482 | #if (-1 >> 1) != -1 |
483 | #error FastDecodeDeltas() needs right-shift to sign-extend. |
484 | #endif |
485 | inline const char* Varint::FastDecodeDeltas(const char* ptr, |
486 | int64 goal, |
487 | int64* out) { |
488 | int64 value; |
489 | int64 sum = - goal; |
490 | int64 shift = 0; |
491 | // Make decoding faster by eliminating unpredictable branching. |
492 | do { |
493 | value = static_cast<int8>(*ptr++); // sign extend one byte of data |
494 | sum += (value & 0x7F) << shift; |
495 | shift += 7; |
496 | // (value >> 7) is either -1(continuation byte) or 0 (stop byte) |
497 | shift &= value >> 7; |
498 | // Loop if we haven't reached goal (sum < 0) or we haven't finished |
499 | // parsing current delta (value < 0). We write it in the form of |
500 | // (a | b) < 0 as opposed to (a < 0 || b < 0) as the former one is |
501 | // usually as fast as a test for (a < 0). |
502 | } while ((sum | value) < 0); |
503 | |
504 | *out = goal + sum; |
505 | return ptr; |
506 | } |
507 | |
508 | #endif /* _VARINT_H */ |
509 | |