/*
 * Copyright 2012-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <array>
#include <cstdint>
#include <limits>

#include <glog/logging.h>

#if !defined(__GNUC__) && !defined(_MSC_VER)
#error GroupVarint.h requires GCC or MSVC
#endif

#include <folly/Portability.h>

#if FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64
#define HAVE_GROUP_VARINT 1

#include <folly/Range.h>
#include <folly/detail/GroupVarintDetail.h>
#include <folly/lang/Bits.h>
#include <folly/portability/Builtins.h>

#if FOLLY_SSE >= 3
#include <nmmintrin.h>
namespace folly {
namespace detail {
extern const std::array<std::array<std::uint32_t, 4>, 256> groupVarintSSEMasks;
} // namespace detail
} // namespace folly
#endif

namespace folly {
namespace detail {
extern const std::array<std::uint8_t, 256> groupVarintLengths;
} // namespace detail
} // namespace folly

namespace folly {

template <typename T>
class GroupVarint;

/**
 * GroupVarint encoding for 32-bit values.
 *
 * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
 * There is one byte of overhead. (The first byte contains the lengths of
 * the four integers, encoded as two bits each; 00=1 byte .. 11=4 bytes.)
 *
 * This implementation assumes a little-endian target and performs unaligned
 * 32-bit accesses, so it is only enabled on the architectures checked above.
 */
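
// For illustration: encoding the group {1, 1000, 100000, 1} uses value
// lengths of 1, 2, 3, and 1 bytes (keys 0, 1, 2, 0), so the header byte is
// (0 << 6) | (2 << 4) | (1 << 2) | 0 == 0x24 and the total encoded size is
// 1 + (1 + 2 + 3 + 1) == 8 bytes.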
template <>
class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
 public:
  /**
   * Return the number of bytes used to encode these four values.
   */
  static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
  }

  /**
   * Return the number of bytes used to encode four uint32_t values stored
   * at consecutive positions in an array.
   */
  static size_t size(const uint32_t* p) {
    return size(p[0], p[1], p[2], p[3]);
  }

  /**
   * Return the number of bytes used to encode count (<= 4) values.
   * If you clip a buffer after this many bytes, you can still decode
   * the first "count" values correctly (even if the remaining size() -
   * partialSize() bytes are filled with garbage).
   */
  static size_t partialSize(const type* p, size_t count) {
    DCHECK_LE(count, kGroupSize);
    size_t s = kHeaderSize + count;
    for (; count; --count, ++p) {
      s += key(*p);
    }
    return s;
  }

  /**
   * Return the number of values from *p that are valid from an encoded
   * buffer of size bytes.
   */
  static size_t partialCount(const char* p, size_t size) {
    uint8_t v = uint8_t(*p);
    size_t s = kHeaderSize;
    s += 1 + b0key(v);
    if (s > size) {
      return 0;
    }
    s += 1 + b1key(v);
    if (s > size) {
      return 1;
    }
    s += 1 + b2key(v);
    if (s > size) {
      return 2;
    }
    s += 1 + b3key(v);
    if (s > size) {
      return 3;
    }
    return 4;
  }

  /**
   * Given a pointer to the beginning of a GroupVarint32-encoded block,
   * return the number of bytes used by the encoding.
   */
  static size_t encodedSize(const char* p) {
    return kHeaderSize + kGroupSize + b0key(uint8_t(*p)) + b1key(uint8_t(*p)) +
        b2key(uint8_t(*p)) + b3key(uint8_t(*p));
  }

  /**
   * Encode four uint32_t values into the buffer pointed-to by p, and return
   * the next position in the buffer (that is, one character past the last
   * encoded byte). p needs to have at least size()+4 bytes available.
   */
  static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    uint8_t b0key = key(a);
    uint8_t b1key = key(b);
    uint8_t b2key = key(c);
    uint8_t b3key = key(d);
    *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
    storeUnaligned(p, a);
    p += b0key + 1;
    storeUnaligned(p, b);
    p += b1key + 1;
    storeUnaligned(p, c);
    p += b2key + 1;
    storeUnaligned(p, d);
    p += b3key + 1;
    return p;
  }

  /**
   * Encode four uint32_t values from the array pointed-to by src into the
   * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
   */
  static char* encode(char* p, const uint32_t* src) {
    return encode(p, src[0], src[1], src[2], src[3]);
  }

  /**
   * Decode four uint32_t values from a buffer, and return the next position
   * in the buffer (that is, one character past the last encoded byte).
   * The buffer needs to have at least 3 extra bytes available (they
   * may be read but ignored).
   */
  static const char* decode_simple(
      const char* p,
      uint32_t* a,
      uint32_t* b,
      uint32_t* c,
      uint32_t* d) {
    size_t k = loadUnaligned<uint8_t>(p);
    const char* end = p + detail::groupVarintLengths[k];
    ++p;
    size_t k0 = b0key(k);
    *a = loadUnaligned<uint32_t>(p) & kMask[k0];
    p += k0 + 1;
    size_t k1 = b1key(k);
    *b = loadUnaligned<uint32_t>(p) & kMask[k1];
    p += k1 + 1;
    size_t k2 = b2key(k);
    *c = loadUnaligned<uint32_t>(p) & kMask[k2];
    p += k2 + 1;
    size_t k3 = b3key(k);
    *d = loadUnaligned<uint32_t>(p) & kMask[k3];
    // p += k3 + 1;
    return end;
  }

  /**
   * Decode four uint32_t values from a buffer and store them in the array
   * pointed-to by dest, similar to decode_simple(p,a,b,c,d) above.
   */
  static const char* decode_simple(const char* p, uint32_t* dest) {
    return decode_simple(p, dest, dest + 1, dest + 2, dest + 3);
  }
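
  // Usage sketch (for illustration): a round trip through encode() and
  // decode_simple(). A buffer of kMaxSize + 4 bytes satisfies the size()+4
  // requirement documented above; GroupVarint32 is the typedef declared at
  // the end of this header.
  //
  //   uint32_t in[4] = {1, 1000, 100000, 1};
  //   char buf[GroupVarint32::kMaxSize + 4] = {};
  //   char* end = GroupVarint32::encode(buf, in);
  //   // end - buf == GroupVarint32::size(in) == 8 for these values
  //   uint32_t out[4];
  //   GroupVarint32::decode_simple(buf, out); // out[i] == in[i] for all i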

#if FOLLY_SSE >= 3
  /**
   * Just like the non-SSSE3 decode below, but with the additional constraint
   * that we must be able to read at least 17 bytes from the input pointer, p.
   */
  static const char* decode(const char* p, uint32_t* dest) {
    uint8_t key = uint8_t(p[0]);
    __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
    __m128i mask =
        _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
    __m128i r = _mm_shuffle_epi8(val, mask);
    _mm_storeu_si128((__m128i*)dest, r);
    return p + detail::groupVarintLengths[key];
  }

  /**
   * Just like decode_simple, but with the additional constraint that
   * we must be able to read at least 17 bytes from the input pointer, p.
   */
  static const char*
  decode(const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
    uint8_t key = uint8_t(p[0]);
    __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
    __m128i mask =
        _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
    __m128i r = _mm_shuffle_epi8(val, mask);

    // Extracting 32 bits at a time out of an XMM register is an SSE4 feature
#if FOLLY_SSE >= 4
    *a = uint32_t(_mm_extract_epi32(r, 0));
    *b = uint32_t(_mm_extract_epi32(r, 1));
    *c = uint32_t(_mm_extract_epi32(r, 2));
    *d = uint32_t(_mm_extract_epi32(r, 3));
#else /* !__SSE4__ */
    *a = _mm_extract_epi16(r, 0) + (_mm_extract_epi16(r, 1) << 16);
    *b = _mm_extract_epi16(r, 2) + (_mm_extract_epi16(r, 3) << 16);
    *c = _mm_extract_epi16(r, 4) + (_mm_extract_epi16(r, 5) << 16);
    *d = _mm_extract_epi16(r, 6) + (_mm_extract_epi16(r, 7) << 16);
#endif /* __SSE4__ */

    return p + detail::groupVarintLengths[key];
  }

#else /* !__SSSE3__ */
  static const char*
  decode(const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
    return decode_simple(p, a, b, c, d);
  }

  static const char* decode(const char* p, uint32_t* dest) {
    return decode_simple(p, dest);
  }
#endif /* __SSSE3__ */

 private:
  static uint8_t key(uint32_t x) {
    // __builtin_clz is undefined for the x==0 case
    return uint8_t(3 - (__builtin_clz(x | 1) / 8));
  }
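  // For illustration (values derived from the formula above): key(1) == 0
  // (1 byte), key(1000) == 1 (2 bytes), key(100000) == 2 (3 bytes), and
  // key(1U << 31) == 3 (4 bytes).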
  static size_t b0key(size_t x) {
    return x & 3;
  }
  static size_t b1key(size_t x) {
    return (x >> 2) & 3;
  }
  static size_t b2key(size_t x) {
    return (x >> 4) & 3;
  }
  static size_t b3key(size_t x) {
    return (x >> 6) & 3;
  }

  static const uint32_t kMask[];
};

/**
 * GroupVarint encoding for 64-bit values.
 *
 * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
 * There are two bytes of overhead. (The first two bytes contain the lengths
 * of the five integers, encoded as three bits each; 000=1 byte .. 111=8 bytes.)
 *
 * This implementation assumes a little-endian target and performs unaligned
 * 64-bit accesses, so it is only enabled on the architectures checked above.
 */
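
// For illustration: encoding the group {1, 256, 65536, 1, 1} uses value
// lengths of 1, 2, 3, 1, and 1 bytes (keys 0, 1, 2, 0, 0), so the 16-bit
// header is (0 << 12) | (0 << 9) | (2 << 6) | (1 << 3) | 0 == 0x88 and the
// total encoded size is 2 + (1 + 2 + 3 + 1 + 1) == 10 bytes.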
template <>
class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
 public:
  /**
   * Return the number of bytes used to encode these five values.
   */
  static size_t
  size(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
    return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d) +
        key(e);
  }

  /**
   * Return the number of bytes used to encode five uint64_t values stored
   * at consecutive positions in an array.
   */
  static size_t size(const uint64_t* p) {
    return size(p[0], p[1], p[2], p[3], p[4]);
  }

  /**
   * Return the number of bytes used to encode count (<= 5) values.
   * If you clip a buffer after this many bytes, you can still decode
   * the first "count" values correctly (even if the remaining size() -
   * partialSize() bytes are filled with garbage).
   */
  static size_t partialSize(const type* p, size_t count) {
    DCHECK_LE(count, kGroupSize);
    size_t s = kHeaderSize + count;
    for (; count; --count, ++p) {
      s += key(*p);
    }
    return s;
  }

  /**
   * Return the number of values from *p that are valid from an encoded
   * buffer of size bytes.
   */
  static size_t partialCount(const char* p, size_t size) {
    uint16_t v = loadUnaligned<uint16_t>(p);
    size_t s = kHeaderSize;
    s += 1 + b0key(v);
    if (s > size) {
      return 0;
    }
    s += 1 + b1key(v);
    if (s > size) {
      return 1;
    }
    s += 1 + b2key(v);
    if (s > size) {
      return 2;
    }
    s += 1 + b3key(v);
    if (s > size) {
      return 3;
    }
    s += 1 + b4key(v);
    if (s > size) {
      return 4;
    }
    return 5;
  }

  /**
   * Given a pointer to the beginning of a GroupVarint64-encoded block,
   * return the number of bytes used by the encoding.
   */
  static size_t encodedSize(const char* p) {
    uint16_t n = loadUnaligned<uint16_t>(p);
    return kHeaderSize + kGroupSize + b0key(n) + b1key(n) + b2key(n) +
        b3key(n) + b4key(n);
  }

  /**
   * Encode five uint64_t values into the buffer pointed-to by p, and return
   * the next position in the buffer (that is, one character past the last
   * encoded byte). p needs to have at least size()+8 bytes available.
   */
  static char*
  encode(char* p, uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
    uint16_t b0key = key(a);
    uint16_t b1key = key(b);
    uint16_t b2key = key(c);
    uint16_t b3key = key(d);
    uint16_t b4key = key(e);
    storeUnaligned<uint16_t>(
        p,
        uint16_t(
            (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) |
            b0key));
    p += 2;
    storeUnaligned(p, a);
    p += b0key + 1;
    storeUnaligned(p, b);
    p += b1key + 1;
    storeUnaligned(p, c);
    p += b2key + 1;
    storeUnaligned(p, d);
    p += b3key + 1;
    storeUnaligned(p, e);
    p += b4key + 1;
    return p;
  }

  /**
   * Encode five uint64_t values from the array pointed-to by src into the
   * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
   */
  static char* encode(char* p, const uint64_t* src) {
    return encode(p, src[0], src[1], src[2], src[3], src[4]);
  }

  /**
   * Decode five uint64_t values from a buffer, and return the next position
   * in the buffer (that is, one character past the last encoded byte).
   * The buffer needs to have at least 7 extra bytes available (they may be
   * read but ignored).
   */
  static const char* decode(
      const char* p,
      uint64_t* a,
      uint64_t* b,
      uint64_t* c,
      uint64_t* d,
      uint64_t* e) {
    uint16_t k = loadUnaligned<uint16_t>(p);
    p += 2;
    uint8_t k0 = b0key(k);
    *a = loadUnaligned<uint64_t>(p) & kMask[k0];
    p += k0 + 1;
    uint8_t k1 = b1key(k);
    *b = loadUnaligned<uint64_t>(p) & kMask[k1];
    p += k1 + 1;
    uint8_t k2 = b2key(k);
    *c = loadUnaligned<uint64_t>(p) & kMask[k2];
    p += k2 + 1;
    uint8_t k3 = b3key(k);
    *d = loadUnaligned<uint64_t>(p) & kMask[k3];
    p += k3 + 1;
    uint8_t k4 = b4key(k);
    *e = loadUnaligned<uint64_t>(p) & kMask[k4];
    p += k4 + 1;
    return p;
  }

  /**
   * Decode five uint64_t values from a buffer and store them in the array
   * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
   */
  static const char* decode(const char* p, uint64_t* dest) {
    return decode(p, dest, dest + 1, dest + 2, dest + 3, dest + 4);
  }

 private:
  enum { kHeaderBytes = 2 };

  static uint8_t key(uint64_t x) {
    // __builtin_clzll is undefined for the x==0 case
    return uint8_t(7 - (__builtin_clzll(x | 1) / 8));
  }

  static uint8_t b0key(uint16_t x) {
    return x & 7u;
  }
  static uint8_t b1key(uint16_t x) {
    return (x >> 3) & 7u;
  }
  static uint8_t b2key(uint16_t x) {
    return (x >> 6) & 7u;
  }
  static uint8_t b3key(uint16_t x) {
    return (x >> 9) & 7u;
  }
  static uint8_t b4key(uint16_t x) {
    return (x >> 12) & 7u;
  }

  static const uint64_t kMask[];
};

typedef GroupVarint<uint32_t> GroupVarint32;
typedef GroupVarint<uint64_t> GroupVarint64;

/**
 * Simplify use of GroupVarint* for the case where data is available one
 * entry at a time (instead of one group at a time). Handles buffering
 * and an incomplete last chunk.
 *
 * Output is a function object that accepts character ranges:
 * out(StringPiece) appends the given character range to the output.
 */
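
// A minimal usage sketch (the names `result` and `values` are illustrative):
//
//   std::string result;
//   auto append = [&](StringPiece sp) { result.append(sp.data(), sp.size()); };
//   GroupVarintEncoder<uint32_t, decltype(append)> enc(append);
//   for (uint32_t v : values) {
//     enc.add(v);
//   }
//   enc.finish(); // flushes the final, possibly incomplete group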
template <class T, class Output>
class GroupVarintEncoder {
 public:
  typedef GroupVarint<T> Base;
  typedef T type;

  explicit GroupVarintEncoder(Output out) : out_(out), count_(0) {}

  ~GroupVarintEncoder() {
    finish();
  }

  /**
   * Add a value to the encoder.
   */
  void add(type val) {
    buf_[count_++] = val;
    if (count_ == Base::kGroupSize) {
      char* p = Base::encode(tmp_, buf_);
      out_(StringPiece(tmp_, p));
      count_ = 0;
    }
  }

  /**
   * Finish encoding, flushing any buffered values if necessary.
   * After finish(), the encoder is immediately ready to encode more data
   * to the same output.
   */
  void finish() {
    if (count_) {
      // This is not strictly necessary, but it makes testing easy;
      // uninitialized bytes are guaranteed to be recorded as taking one byte
      // (not more).
      for (size_t i = count_; i < Base::kGroupSize; i++) {
        buf_[i] = 0;
      }
      Base::encode(tmp_, buf_);
      out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
      count_ = 0;
    }
  }

  /**
   * Return the appender that was used.
   */
  Output& output() {
    return out_;
  }
  const Output& output() const {
    return out_;
  }

  /**
   * Reset the encoder, disregarding any state (except what was already
   * flushed to the output, of course).
   */
  void clear() {
    count_ = 0;
  }

 private:
  Output out_;
  char tmp_[Base::kMaxSize];
  type buf_[Base::kGroupSize];
  size_t count_;
};

/**
 * Simplify use of GroupVarint* for the case where the last group in the
 * input may be incomplete (but the exact size of the input is known).
 * Allows for extracting values one at a time.
 */
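
// A minimal usage sketch (assuming `encoded` holds GroupVarint32-encoded
// data, e.g. produced by the GroupVarintEncoder example above):
//
//   GroupVarint32Decoder dec(StringPiece(encoded));
//   uint32_t v;
//   while (dec.next(&v)) {
//     // consume v
//   }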
template <typename T>
class GroupVarintDecoder {
 public:
  typedef GroupVarint<T> Base;
  typedef T type;

  GroupVarintDecoder() = default;

  explicit GroupVarintDecoder(StringPiece data, size_t maxCount = (size_t)-1)
      : rrest_(data.end()),
        p_(data.data()),
        end_(data.end()),
        limit_(end_),
        pos_(0),
        count_(0),
        remaining_(maxCount) {}

  void reset(StringPiece data, size_t maxCount = (size_t)-1) {
    rrest_ = data.end();
    p_ = data.data();
    end_ = data.end();
    limit_ = end_;
    pos_ = 0;
    count_ = 0;
    remaining_ = maxCount;
  }

  /**
   * Read and return the next value.
   */
  bool next(type* val) {
    if (pos_ == count_) {
      // refill
      size_t rem = size_t(end_ - p_);
      if (rem == 0 || remaining_ == 0) {
        return false;
      }
      // next() attempts to read one full group at a time, so we must have
      // enough bytes readable past the end of the remaining data to handle
      // the case where the last group is full.
      //
      // The simplest way to guarantee this is to require that the data have at
      // least Base::kMaxSize - 1 bytes readable *after* its end; otherwise we
      // copy the tail into a temporary buffer.
      if (limit_ - p_ < Base::kMaxSize) {
        memcpy(tmp_, p_, rem);
        p_ = tmp_;
        end_ = p_ + rem;
        limit_ = tmp_ + sizeof(tmp_);
      }
      pos_ = 0;
      const char* n = Base::decode(p_, buf_);
      if (n <= end_) {
        // Full group could be decoded
        if (remaining_ >= Base::kGroupSize) {
          remaining_ -= Base::kGroupSize;
          count_ = Base::kGroupSize;
          p_ = n;
        } else {
          count_ = remaining_;
          remaining_ = 0;
          p_ += Base::partialSize(buf_, count_);
        }
      } else {
        // Can't decode a full group
        count_ = Base::partialCount(p_, size_t(end_ - p_));
        if (remaining_ >= count_) {
          remaining_ -= count_;
          p_ = end_;
        } else {
          count_ = remaining_;
          remaining_ = 0;
          p_ += Base::partialSize(buf_, count_);
        }
        if (count_ == 0) {
          return false;
        }
      }
    }
    *val = buf_[pos_++];
    return true;
  }

  StringPiece rest() const {
    // This is only valid after next() returned false
    CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
    // p_ may point to the internal buffer (tmp_), but we want to return a
    // subpiece of the original data.
    size_t size = size_t(end_ - p_);
    return StringPiece(rrest_ - size, rrest_);
  }

 private:
  const char* rrest_;
  const char* p_;
  const char* end_;
  const char* limit_;
  char tmp_[2 * Base::kMaxSize];
  type buf_[Base::kGroupSize];
  size_t pos_;
  size_t count_;
  size_t remaining_;
};

typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;

} // namespace folly

#endif /* FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64 */
665