1// Copyright 2001 and onwards Google Inc.
2//
3// Raw support for varint encoding. Higher level interfaces are
4// provided by Encoder/Decoder/IOBuffer. Clients should typically use
5// those interfaces, unless speed is paramount.
6//
7// If decoding speed is very important, consider using PrefixVarint instead.
8// It has the same compression ratio, but generally faster decoding.
9//
10// Provided routines:
11// vi_parse32_unchecked
12// vi_parse64_unchecked
13// vi_encode32_unchecked
14// vi_encode64_unchecked
15
16#ifndef _VARINT_H
17#define _VARINT_H
18
19#include <string>
20using std::string;
21
22#include "base/basictypes.h"
23
24// Just a namespace, not a real class
25class Varint {
26 public:
27 // Maximum lengths of varint encoding of uint32 and uint64
28 static const int kMax32 = 5;
29 static const int kMax64 = 10;
30
31 // REQUIRES "ptr" points to a buffer of length at least kMaxXX
32 // EFFECTS Scan next varint from "ptr" and store in OUTPUT.
33 // Returns pointer just past last read byte. Returns
34 // NULL if a valid varint value was not found.
35 static const char* Parse32(const char* ptr, uint32* OUTPUT);
36 static const char* Parse64(const char* ptr, uint64* OUTPUT);
37
38 // A fully inlined version of Parse32: useful in the most time critical
39 // routines, but its code size is large
40 static const char* Parse32Inline(const char* ptr, uint32* OUTPUT);
41
42 // REQUIRES "ptr" points just past the last byte of a varint-encoded value.
43 // REQUIRES A second varint must be encoded just before the one we parse,
44 // OR "base" must point to the first byte of the one we parse.
45 // REQUIRES Bytes [base, ptr-1] are readable
46 //
47 // EFFECTS Scan backwards from "ptr" and store in OUTPUT. Stop at the last
48 // byte of the previous varint, OR at "base", whichever one comes
49 // first. Returns pointer to the first byte of the decoded varint
50 // NULL if a valid varint value was not found.
51 static const char* Parse32Backward(const char* ptr, const char* base,
52 uint32* OUTPUT);
53 static const char* Parse64Backward(const char* ptr, const char* base,
54 uint64* OUTPUT);
55
56 // Attempts to parse a varint32 from a prefix of the bytes in [ptr,limit-1].
57 // Never reads a character at or beyond limit. If a valid/terminated varint32
58 // was found in the range, stores it in *OUTPUT and returns a pointer just
59 // past the last byte of the varint32. Else returns NULL. On success,
60 // "result <= limit".
61 static const char* Parse32WithLimit(const char* ptr, const char* limit,
62 uint32* OUTPUT);
63 static const char* Parse64WithLimit(const char* ptr, const char* limit,
64 uint64* OUTPUT);
65
66 // REQUIRES "ptr" points to the first byte of a varint-encoded value.
67 // EFFECTS Scans until the end of the varint and returns a pointer just
68 // past the last byte. Returns NULL if "ptr" does not point to
69 // a valid varint value.
70 static const char* Skip32(const char* ptr);
71 static const char* Skip64(const char* ptr);
72
73 // REQUIRES "ptr" points just past the last byte of a varint-encoded value.
74 // REQUIRES A second varint must be encoded just before the one we parse,
75 // OR "base" must point to the first byte of the one we parse.
76 // REQUIRES Bytes [base, ptr-1] are readable
77 //
78 // EFFECTS Scan backwards from "ptr" and stop at the last byte of the
79 // previous varint, OR at "base", whichever one comes first.
80 // Returns pointer to the first byte of the skipped varint or
81 // NULL if a valid varint value was not found.
82 static const char* Skip32Backward(const char* ptr, const char* base);
83 static const char* Skip64Backward(const char* ptr, const char* base);
84
85 // REQUIRES "ptr" points to a buffer of length sufficient to hold "v".
86 // EFFECTS Encodes "v" into "ptr" and returns a pointer to the
87 // byte just past the last encoded byte.
88 static char* Encode32(char* ptr, uint32 v);
89 static char* Encode64(char* ptr, uint64 v);
90
91 // A fully inlined version of Encode32: useful in the most time critical
92 // routines, but its code size is large
93 static char* Encode32Inline(char* ptr, uint32 v);
94
95 // EFFECTS Returns the encoding length of the specified value.
96 static int Length32(uint32 v);
97 static int Length64(uint64 v);
98
99 static int Length32NonInline(uint32 v);
100
101 // EFFECTS Appends the varint representation of "value" to "*s".
102 static void Append32(string* s, uint32 value);
103 static void Append64(string* s, uint64 value);
104
105 // EFFECTS Encodes a pair of values to "*s". The encoding
106 // is done by weaving together 4 bit groups of
107 // each number into a single 64 bit value, and then
108 // encoding this value as a Varint64 value. This means
109 // that if both a and b are small, both values can be
110 // encoded in a single byte.
111 static void EncodeTwo32Values(string* s, uint32 a, uint32 b);
112 static const char* DecodeTwo32Values(const char* ptr, uint32* a, uint32* b);
113
114 // Decode and sum up a sequence of deltas until the sum >= goal.
115 // It is significantly faster than calling ParseXXInline in a loop.
116 // NOTE(user): The code does NO error checking, it assumes all the
117 // deltas are valid and the sum of deltas will never exceed kint64max. The
118 // code works for both 32bits and 64bits varint, and on 64 bits machines,
119 // the 64 bits version is almost always faster. Thus we only have a 64 bits
120 // interface here. The interface is slightly different from the other
121 // functions in that it requires *signed* integers.
122 // REQUIRES "ptr" points to the first byte of a varint-encoded delta.
123 // The sum of deltas >= goal (the code does NO boundary check).
124 // goal is positive and fit into a signed int64.
125 // EFFECTS Returns a pointer just past last read byte.
126 // "out" stores the actual sum.
127 static const char* FastDecodeDeltas(const char* ptr, int64 goal, int64* out);
128
129 private:
130 static const char* Parse32FallbackInline(const char* p, uint32* val);
131 static const char* Parse32Fallback(const char* p, uint32* val);
132 static const char* Parse64Fallback(const char* p, uint64* val);
133
134 static char* Encode32Fallback(char* ptr, uint32 v);
135
136 static const char* DecodeTwo32ValuesSlow(const char* p, uint32* a, uint32* b);
137 static const char* Parse32BackwardSlow(const char* ptr, const char* base,
138 uint32* OUTPUT);
139 static const char* Parse64BackwardSlow(const char* ptr, const char* base,
140 uint64* OUTPUT);
141 static const char* Skip32BackwardSlow(const char* ptr, const char* base);
142 static const char* Skip64BackwardSlow(const char* ptr, const char* base);
143
144 static void Append32Slow(string* s, uint32 value);
145 static void Append64Slow(string* s, uint64 value);
146
147 // Mapping from rightmost bit set to the number of bytes required
148 static const char length32_bytes_required[33];
149};
150
151/***** Implementation details; clients should ignore *****/
152
153inline const char* Varint::Parse32FallbackInline(const char* p,
154 uint32* OUTPUT) {
155 // Fast path
156 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
157 uint32 byte, result;
158 byte = *(ptr++); result = byte & 127;
159 assert(byte >= 128); // Already checked in inlined prelude
160 byte = *(ptr++); result |= (byte & 127) << 7; if (byte < 128) goto done;
161 byte = *(ptr++); result |= (byte & 127) << 14; if (byte < 128) goto done;
162 byte = *(ptr++); result |= (byte & 127) << 21; if (byte < 128) goto done;
163 byte = *(ptr++); result |= (byte & 127) << 28; if (byte < 128) goto done;
164 return NULL; // Value is too long to be a varint32
165 done:
166 *OUTPUT = result;
167 return reinterpret_cast<const char*>(ptr);
168}
169
170inline const char* Varint::Parse32(const char* p, uint32* OUTPUT) {
171 // Fast path for inlining
172 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
173 uint32 byte = *ptr;
174 if (byte < 128) {
175 *OUTPUT = byte;
176 return reinterpret_cast<const char*>(ptr) + 1;
177 } else {
178 return Parse32Fallback(p, OUTPUT);
179 }
180}
181
182inline const char* Varint::Parse32Inline(const char* p, uint32* OUTPUT) {
183 // Fast path for inlining
184 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
185 uint32 byte = *ptr;
186 if (byte < 128) {
187 *OUTPUT = byte;
188 return reinterpret_cast<const char*>(ptr) + 1;
189 } else {
190 return Parse32FallbackInline(p, OUTPUT);
191 }
192}
193
194inline const char* Varint::Skip32(const char* p) {
195 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
196 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
197 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
198 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
199 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
200 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
201 return NULL; // value is too long to be a varint32
202}
203
204inline const char* Varint::Parse32Backward(const char* p, const char* base,
205 uint32* OUTPUT) {
206 if (p > base + kMax32) {
207 // Fast path
208 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
209 uint32 byte, result;
210 byte = *(--ptr); if (byte > 127) return NULL;
211 result = byte;
212 byte = *(--ptr); if (byte < 128) goto done;
213 result <<= 7; result |= (byte & 127);
214 byte = *(--ptr); if (byte < 128) goto done;
215 result <<= 7; result |= (byte & 127);
216 byte = *(--ptr); if (byte < 128) goto done;
217 result <<= 7; result |= (byte & 127);
218 byte = *(--ptr); if (byte < 128) goto done;
219 result <<= 7; result |= (byte & 127);
220 byte = *(--ptr); if (byte < 128) goto done;
221 return NULL; // Value is too long to be a varint32
222 done:
223 *OUTPUT = result;
224 return reinterpret_cast<const char*>(ptr+1);
225 } else {
226 return Parse32BackwardSlow(p, base, OUTPUT);
227 }
228}
229
230inline const char* Varint::Skip32Backward(const char* p, const char* base) {
231 if (p > base + kMax32) {
232 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
233 if (*(--ptr) > 127) return NULL;
234 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
235 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
236 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
237 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
238 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
239 return NULL; // value is too long to be a varint32
240 } else {
241 return Skip32BackwardSlow(p, base);
242 }
243}
244
245inline const char* Varint::Parse32WithLimit(const char* p,
246 const char* l,
247 uint32* OUTPUT) {
248 if (p + kMax32 <= l) {
249 return Parse32(p, OUTPUT);
250 } else {
251 // Slow version with bounds checks
252 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
253 const unsigned char* limit = reinterpret_cast<const unsigned char*>(l);
254 uint32 b, result;
255 if (ptr >= limit) return NULL;
256 b = *(ptr++); result = b & 127; if (b < 128) goto done;
257 if (ptr >= limit) return NULL;
258 b = *(ptr++); result |= (b & 127) << 7; if (b < 128) goto done;
259 if (ptr >= limit) return NULL;
260 b = *(ptr++); result |= (b & 127) << 14; if (b < 128) goto done;
261 if (ptr >= limit) return NULL;
262 b = *(ptr++); result |= (b & 127) << 21; if (b < 128) goto done;
263 if (ptr >= limit) return NULL;
264 b = *(ptr++); result |= (b & 127) << 28; if (b < 16) goto done;
265 return NULL; // Value is too long to be a varint32
266 done:
267 *OUTPUT = result;
268 return reinterpret_cast<const char*>(ptr);
269 }
270
271}
272
273inline const char* Varint::Parse64(const char* p, uint64* OUTPUT) {
274 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
275 uint32 byte = *ptr;
276 if (byte < 128) {
277 *OUTPUT = byte;
278 return reinterpret_cast<const char*>(ptr) + 1;
279 } else {
280 return Parse64Fallback(p, OUTPUT);
281 }
282}
283
284inline const char* Varint::Skip64(const char* p) {
285 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
286 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
287 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
288 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
289 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
290 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
291 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
292 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
293 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
294 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
295 if (*ptr++ < 128) return reinterpret_cast<const char*>(ptr);
296 return NULL; // value is too long to be a varint64
297}
298
299inline const char* Varint::Parse64Backward(const char* p, const char* b,
300 uint64* OUTPUT) {
301 if (p > b + kMax64) {
302 // Fast path
303 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
304 uint32 byte;
305 uint64 res;
306
307 byte = *(--ptr); if (byte > 127) return NULL;
308
309 res = byte;
310 byte = *(--ptr); if (byte < 128) goto done;
311 res <<= 7; res |= (byte & 127);
312 byte = *(--ptr); if (byte < 128) goto done;
313 res <<= 7; res |= (byte & 127);
314 byte = *(--ptr); if (byte < 128) goto done;
315 res <<= 7; res |= (byte & 127);
316 byte = *(--ptr); if (byte < 128) goto done;
317 res <<= 7; res |= (byte & 127);
318 byte = *(--ptr); if (byte < 128) goto done;
319 res <<= 7; res |= (byte & 127);
320 byte = *(--ptr); if (byte < 128) goto done;
321 res <<= 7; res |= (byte & 127);
322 byte = *(--ptr); if (byte < 128) goto done;
323 res <<= 7; res |= (byte & 127);
324 byte = *(--ptr); if (byte < 128) goto done;
325 res <<= 7; res |= (byte & 127);
326 byte = *(--ptr); if (byte < 128) goto done;
327 res <<= 7; res |= (byte & 127);
328 byte = *(--ptr); if (byte < 128) goto done;
329
330 return NULL; // Value is too long to be a varint64
331
332 done:
333 *OUTPUT = res;
334 return reinterpret_cast<const char*>(ptr + 1);
335 } else {
336 return Parse64BackwardSlow(p, b, OUTPUT);
337 }
338}
339
340inline const char* Varint::Skip64Backward(const char* p, const char* b) {
341 if (p > b + kMax64) {
342 // Fast path
343 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
344 if (*(--ptr) > 127) return NULL;
345 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
346 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
347 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
348 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
349 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
350 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
351 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
352 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
353 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
354 if (*(--ptr) < 128) return reinterpret_cast<const char*>(ptr+1);
355 return NULL; // value is too long to be a varint64
356 } else {
357 return Skip64BackwardSlow(p, b);
358 }
359}
360
361inline const char* Varint::DecodeTwo32Values(const char* p,
362 uint32* a, uint32* b) {
363 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(p);
364 if (*ptr < 128) {
365 // Special case for small values
366 *a = (*ptr & 0xf);
367 *b = *ptr >> 4;
368 return reinterpret_cast<const char*>(ptr) + 1;
369 } else {
370 return DecodeTwo32ValuesSlow(p, a, b);
371 }
372}
373
374#if (defined __i386__ || defined __x86_64__) && defined __GNUC__
375inline int Varint::Length32(uint32 v) {
376 // Find the rightmost bit set, and index into a small table
377
378 // "ro" for the input spec means the input can come from either a
379 // register ("r") or offsetable memory ("o").
380 //
381 // If "n == 0", the "bsr" instruction sets the "Z" flag, so we
382 // conditionally move "-1" into the result.
383 //
384 // Note: the cmovz was introduced on PIII's, and may not work on
385 // older machines.
386 int bits;
387 const int neg1 = -1;
388 __asm__("bsr %1, %0\n\t"
389 "cmovz %2, %0"
390 : "=&r" (bits) // Output spec, early clobber
391 : "ro" (v), "r" (neg1) // Input spec
392 : "cc" // Clobbers condition-codes
393 );
394 return Varint::length32_bytes_required[bits+1];
395}
396
397#else
398inline int Varint::Length32(uint32 v) {
399 /*
400 The following version is about 1.5X the code size, but is faster than
401 the loop below.
402
403 if (v < (1u << 14)) {
404 if (v < (1u << 7)) {
405 return 1;
406 } else {
407 return 2;
408 }
409 } else {
410 if (v < (1u << 28)) {
411 if (v < (1u << 21)) {
412 return 3;
413 } else {
414 return 4;
415 }
416 } else {
417 return 5;
418 }
419 }
420 */
421
422 // Each byte of output stores 7 bits of "v" until "v" becomes zero
423 int nbytes = 0;
424 do {
425 nbytes++;
426 v >>= 7;
427 } while (v != 0);
428 return nbytes;
429}
430#endif
431
432inline void Varint::Append32(string* s, uint32 value) {
433 // Inline the fast-path for single-character output, but fall back to the .cc
434 // file for the full version. The size<capacity check is so the compiler can
435 // optimize out the string resize code.
436 if (value < 128 && s->size() < s->capacity()) {
437 s->push_back((unsigned char)value);
438 } else {
439 Append32Slow(s, value);
440 }
441}
442
443inline void Varint::Append64(string* s, uint64 value) {
444 // Inline the fast-path for single-character output, but fall back to the .cc
445 // file for the full version. The size<capacity check is so the compiler can
446 // optimize out the string resize code.
447 if (value < 128 && s->size() < s->capacity()) {
448 s->push_back((unsigned char)value);
449 } else {
450 Append64Slow(s, value);
451 }
452}
453
454inline char* Varint::Encode32Inline(char* sptr, uint32 v) {
455 // Operate on characters as unsigneds
456 unsigned char* ptr = reinterpret_cast<unsigned char*>(sptr);
457 static const int B = 128;
458 if (v < (1<<7)) {
459 *(ptr++) = v;
460 } else if (v < (1<<14)) {
461 *(ptr++) = v | B;
462 *(ptr++) = v>>7;
463 } else if (v < (1<<21)) {
464 *(ptr++) = v | B;
465 *(ptr++) = (v>>7) | B;
466 *(ptr++) = v>>14;
467 } else if (v < (1<<28)) {
468 *(ptr++) = v | B;
469 *(ptr++) = (v>>7) | B;
470 *(ptr++) = (v>>14) | B;
471 *(ptr++) = v>>21;
472 } else {
473 *(ptr++) = v | B;
474 *(ptr++) = (v>>7) | B;
475 *(ptr++) = (v>>14) | B;
476 *(ptr++) = (v>>21) | B;
477 *(ptr++) = v>>28;
478 }
479 return reinterpret_cast<char*>(ptr);
480}
481
482#if (-1 >> 1) != -1
483#error FastDecodeDeltas() needs right-shift to sign-extend.
484#endif
485inline const char* Varint::FastDecodeDeltas(const char* ptr,
486 int64 goal,
487 int64* out) {
488 int64 value;
489 int64 sum = - goal;
490 int64 shift = 0;
491 // Make decoding faster by eliminating unpredictable branching.
492 do {
493 value = static_cast<int8>(*ptr++); // sign extend one byte of data
494 sum += (value & 0x7F) << shift;
495 shift += 7;
496 // (value >> 7) is either -1(continuation byte) or 0 (stop byte)
497 shift &= value >> 7;
498 // Loop if we haven't reached goal (sum < 0) or we haven't finished
499 // parsing current delta (value < 0). We write it in the form of
500 // (a | b) < 0 as opposed to (a < 0 || b < 0) as the former one is
501 // usually as fast as a test for (a < 0).
502 } while ((sum | value) < 0);
503
504 *out = goal + sum;
505 return ptr;
506}
507
508#endif /* _VARINT_H */
509