/*
 * Copyright 2012-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <array>
#include <cstdint>
#include <limits>

#include <glog/logging.h>

#if !defined(__GNUC__) && !defined(_MSC_VER)
#error GroupVarint.h requires GCC or MSVC
#endif

#include <folly/Portability.h>

#if FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64
#define HAVE_GROUP_VARINT 1

#include <folly/Range.h>
#include <folly/detail/GroupVarintDetail.h>
#include <folly/lang/Bits.h>
#include <folly/portability/Builtins.h>

#if FOLLY_SSE >= 3
#include <nmmintrin.h>
namespace folly {
namespace detail {
extern const std::array<std::array<std::uint32_t, 4>, 256> groupVarintSSEMasks;
} // namespace detail
} // namespace folly
#endif

namespace folly {
namespace detail {
extern const std::array<std::uint8_t, 256> groupVarintLengths;
} // namespace detail
} // namespace folly

namespace folly {

template <typename T>
class GroupVarint;

/**
 * GroupVarint encoding for 32-bit values.
 *
 * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
 * There is one byte of overhead. (The first byte contains the lengths of
 * the four integers, encoded as two bits each; 00=1 byte .. 11=4 bytes.)
 *
 * This implementation assumes a little-endian target and performs unaligned
 * 32-bit accesses, so it is only enabled on the architectures checked above.
 */
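
// For illustration: encoding the group {1, 1000, 100000, 1} uses value
// lengths of 1, 2, 3, and 1 bytes (keys 0, 1, 2, 0), so the header byte is
// (0 << 6) | (2 << 4) | (1 << 2) | 0 == 0x24 and the total encoded size is
// 1 + (1 + 2 + 3 + 1) == 8 bytes.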
template <>
class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
 public:
  /**
   * Return the number of bytes used to encode these four values.
   */
  static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
  }

  /**
   * Return the number of bytes used to encode four uint32_t values stored
   * at consecutive positions in an array.
   */
  static size_t size(const uint32_t* p) {
    return size(p[0], p[1], p[2], p[3]);
  }

  /**
   * Return the number of bytes used to encode count (<= 4) values.
   * If you clip a buffer after this many bytes, you can still decode
   * the first "count" values correctly (even if the remaining size() -
   * partialSize() bytes are filled with garbage).
   */
  static size_t partialSize(const type* p, size_t count) {
    DCHECK_LE(count, kGroupSize);
    size_t s = kHeaderSize + count;
    for (; count; --count, ++p) {
      s += key(*p);
    }
    return s;
  }

  /**
   * Return the number of values from *p that are valid from an encoded
   * buffer of size bytes.
   */
  static size_t partialCount(const char* p, size_t size) {
    uint8_t v = uint8_t(*p);
    size_t s = kHeaderSize;
    s += 1 + b0key(v);
    if (s > size) {
      return 0;
    }
    s += 1 + b1key(v);
    if (s > size) {
      return 1;
    }
    s += 1 + b2key(v);
    if (s > size) {
      return 2;
    }
    s += 1 + b3key(v);
    if (s > size) {
      return 3;
    }
    return 4;
  }

  /**
   * Given a pointer to the beginning of a GroupVarint32-encoded block,
   * return the number of bytes used by the encoding.
   */
  static size_t encodedSize(const char* p) {
    return kHeaderSize + kGroupSize + b0key(uint8_t(*p)) + b1key(uint8_t(*p)) +
        b2key(uint8_t(*p)) + b3key(uint8_t(*p));
  }

  /**
   * Encode four uint32_t values into the buffer pointed-to by p, and return
   * the next position in the buffer (that is, one character past the last
   * encoded byte). p needs to have at least size()+4 bytes available.
   */
  static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    uint8_t b0key = key(a);
    uint8_t b1key = key(b);
    uint8_t b2key = key(c);
    uint8_t b3key = key(d);
    *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
    storeUnaligned(p, a);
    p += b0key + 1;
    storeUnaligned(p, b);
    p += b1key + 1;
    storeUnaligned(p, c);
    p += b2key + 1;
    storeUnaligned(p, d);
    p += b3key + 1;
    return p;
  }

  /**
   * Encode four uint32_t values from the array pointed-to by src into the
   * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
   */
  static char* encode(char* p, const uint32_t* src) {
    return encode(p, src[0], src[1], src[2], src[3]);
  }

  /**
   * Decode four uint32_t values from a buffer, and return the next position
   * in the buffer (that is, one character past the last encoded byte).
   * The buffer needs to have at least 3 extra bytes available (they
   * may be read but ignored).
   */
  static const char* decode_simple(
      const char* p,
      uint32_t* a,
      uint32_t* b,
      uint32_t* c,
      uint32_t* d) {
    size_t k = loadUnaligned<uint8_t>(p);
    const char* end = p + detail::groupVarintLengths[k];
    ++p;
    size_t k0 = b0key(k);
    *a = loadUnaligned<uint32_t>(p) & kMask[k0];
    p += k0 + 1;
    size_t k1 = b1key(k);
    *b = loadUnaligned<uint32_t>(p) & kMask[k1];
    p += k1 + 1;
    size_t k2 = b2key(k);
    *c = loadUnaligned<uint32_t>(p) & kMask[k2];
    p += k2 + 1;
    size_t k3 = b3key(k);
    *d = loadUnaligned<uint32_t>(p) & kMask[k3];
    // p += k3 + 1;
    return end;
  }

  /**
   * Decode four uint32_t values from a buffer and store them in the array
   * pointed-to by dest, similar to decode_simple(p,a,b,c,d) above.
   */
  static const char* decode_simple(const char* p, uint32_t* dest) {
    return decode_simple(p, dest, dest + 1, dest + 2, dest + 3);
  }
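
  // Usage sketch (for illustration): a round trip through encode() and
  // decode_simple(). A buffer of kMaxSize + 4 bytes satisfies the size()+4
  // requirement documented above; GroupVarint32 is the typedef declared at
  // the end of this header.
  //
  //   uint32_t in[4] = {1, 1000, 100000, 1};
  //   char buf[GroupVarint32::kMaxSize + 4] = {};
  //   char* end = GroupVarint32::encode(buf, in);
  //   // end - buf == GroupVarint32::size(in) == 8 for these values
  //   uint32_t out[4];
  //   GroupVarint32::decode_simple(buf, out); // out[i] == in[i] for all i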

#if FOLLY_SSE >= 3
  /**
   * Just like the non-SSSE3 decode below, but with the additional constraint
   * that we must be able to read at least 17 bytes from the input pointer, p.
   */
  static const char* decode(const char* p, uint32_t* dest) {
    uint8_t key = uint8_t(p[0]);
    __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
    __m128i mask =
        _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
    __m128i r = _mm_shuffle_epi8(val, mask);
    _mm_storeu_si128((__m128i*)dest, r);
    return p + detail::groupVarintLengths[key];
  }

  /**
   * Just like decode_simple, but with the additional constraint that
   * we must be able to read at least 17 bytes from the input pointer, p.
   */
  static const char*
  decode(const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
    uint8_t key = uint8_t(p[0]);
    __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
    __m128i mask =
        _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
    __m128i r = _mm_shuffle_epi8(val, mask);

    // Extracting 32 bits at a time out of an XMM register is an SSE4 feature
#if FOLLY_SSE >= 4
    *a = uint32_t(_mm_extract_epi32(r, 0));
    *b = uint32_t(_mm_extract_epi32(r, 1));
    *c = uint32_t(_mm_extract_epi32(r, 2));
    *d = uint32_t(_mm_extract_epi32(r, 3));
#else /* !__SSE4__ */
    *a = _mm_extract_epi16(r, 0) + (_mm_extract_epi16(r, 1) << 16);
    *b = _mm_extract_epi16(r, 2) + (_mm_extract_epi16(r, 3) << 16);
    *c = _mm_extract_epi16(r, 4) + (_mm_extract_epi16(r, 5) << 16);
    *d = _mm_extract_epi16(r, 6) + (_mm_extract_epi16(r, 7) << 16);
#endif /* __SSE4__ */

    return p + detail::groupVarintLengths[key];
  }

#else /* !__SSSE3__ */
  static const char*
  decode(const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
    return decode_simple(p, a, b, c, d);
  }

  static const char* decode(const char* p, uint32_t* dest) {
    return decode_simple(p, dest);
  }
#endif /* __SSSE3__ */

 private:
  static uint8_t key(uint32_t x) {
    // __builtin_clz is undefined for the x==0 case
    return uint8_t(3 - (__builtin_clz(x | 1) / 8));
  }
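  // For illustration (values derived from the formula above): key(1) == 0
  // (1 byte), key(1000) == 1 (2 bytes), key(100000) == 2 (3 bytes), and
  // key(1U << 31) == 3 (4 bytes).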
  static size_t b0key(size_t x) {
    return x & 3;
  }
  static size_t b1key(size_t x) {
    return (x >> 2) & 3;
  }
  static size_t b2key(size_t x) {
    return (x >> 4) & 3;
  }
  static size_t b3key(size_t x) {
    return (x >> 6) & 3;
  }

  static const uint32_t kMask[];
};

/**
 * GroupVarint encoding for 64-bit values.
 *
 * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
 * There are two bytes of overhead. (The first two bytes contain the lengths
 * of the five integers, encoded as three bits each; 000=1 byte .. 111=8 bytes.)
 *
 * This implementation assumes a little-endian target and performs unaligned
 * 64-bit accesses, so it is only enabled on the architectures checked above.
 */
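
// For illustration: encoding the group {1, 256, 65536, 1, 1} uses value
// lengths of 1, 2, 3, 1, and 1 bytes (keys 0, 1, 2, 0, 0), so the 16-bit
// header is (0 << 12) | (0 << 9) | (2 << 6) | (1 << 3) | 0 == 0x88 and the
// total encoded size is 2 + (1 + 2 + 3 + 1 + 1) == 10 bytes.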
template <>
class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
 public:
  /**
   * Return the number of bytes used to encode these five values.
   */
  static size_t
  size(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
    return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d) +
        key(e);
  }

  /**
   * Return the number of bytes used to encode five uint64_t values stored
   * at consecutive positions in an array.
   */
  static size_t size(const uint64_t* p) {
    return size(p[0], p[1], p[2], p[3], p[4]);
  }

  /**
   * Return the number of bytes used to encode count (<= 5) values.
   * If you clip a buffer after this many bytes, you can still decode
   * the first "count" values correctly (even if the remaining size() -
   * partialSize() bytes are filled with garbage).
   */
  static size_t partialSize(const type* p, size_t count) {
    DCHECK_LE(count, kGroupSize);
    size_t s = kHeaderSize + count;
    for (; count; --count, ++p) {
      s += key(*p);
    }
    return s;
  }

  /**
   * Return the number of values from *p that are valid from an encoded
   * buffer of size bytes.
   */
  static size_t partialCount(const char* p, size_t size) {
    uint16_t v = loadUnaligned<uint16_t>(p);
    size_t s = kHeaderSize;
    s += 1 + b0key(v);
    if (s > size) {
      return 0;
    }
    s += 1 + b1key(v);
    if (s > size) {
      return 1;
    }
    s += 1 + b2key(v);
    if (s > size) {
      return 2;
    }
    s += 1 + b3key(v);
    if (s > size) {
      return 3;
    }
    s += 1 + b4key(v);
    if (s > size) {
      return 4;
    }
    return 5;
  }

  /**
   * Given a pointer to the beginning of a GroupVarint64-encoded block,
   * return the number of bytes used by the encoding.
   */
  static size_t encodedSize(const char* p) {
    uint16_t n = loadUnaligned<uint16_t>(p);
    return kHeaderSize + kGroupSize + b0key(n) + b1key(n) + b2key(n) +
        b3key(n) + b4key(n);
  }

  /**
   * Encode five uint64_t values into the buffer pointed-to by p, and return
   * the next position in the buffer (that is, one character past the last
   * encoded byte). p needs to have at least size()+8 bytes available.
   */
  static char*
  encode(char* p, uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
    uint16_t b0key = key(a);
    uint16_t b1key = key(b);
    uint16_t b2key = key(c);
    uint16_t b3key = key(d);
    uint16_t b4key = key(e);
    storeUnaligned<uint16_t>(
        p,
        uint16_t(
            (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) |
            b0key));
    p += 2;
    storeUnaligned(p, a);
    p += b0key + 1;
    storeUnaligned(p, b);
    p += b1key + 1;
    storeUnaligned(p, c);
    p += b2key + 1;
    storeUnaligned(p, d);
    p += b3key + 1;
    storeUnaligned(p, e);
    p += b4key + 1;
    return p;
  }

  /**
   * Encode five uint64_t values from the array pointed-to by src into the
   * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
   */
  static char* encode(char* p, const uint64_t* src) {
    return encode(p, src[0], src[1], src[2], src[3], src[4]);
  }

  /**
   * Decode five uint64_t values from a buffer, and return the next position
   * in the buffer (that is, one character past the last encoded byte).
   * The buffer needs to have at least 7 extra bytes available (they may be
   * read but ignored).
   */
  static const char* decode(
      const char* p,
      uint64_t* a,
      uint64_t* b,
      uint64_t* c,
      uint64_t* d,
      uint64_t* e) {
    uint16_t k = loadUnaligned<uint16_t>(p);
    p += 2;
    uint8_t k0 = b0key(k);
    *a = loadUnaligned<uint64_t>(p) & kMask[k0];
    p += k0 + 1;
    uint8_t k1 = b1key(k);
    *b = loadUnaligned<uint64_t>(p) & kMask[k1];
    p += k1 + 1;
    uint8_t k2 = b2key(k);
    *c = loadUnaligned<uint64_t>(p) & kMask[k2];
    p += k2 + 1;
    uint8_t k3 = b3key(k);
    *d = loadUnaligned<uint64_t>(p) & kMask[k3];
    p += k3 + 1;
    uint8_t k4 = b4key(k);
    *e = loadUnaligned<uint64_t>(p) & kMask[k4];
    p += k4 + 1;
    return p;
  }

  /**
   * Decode five uint64_t values from a buffer and store them in the array
   * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
   */
  static const char* decode(const char* p, uint64_t* dest) {
    return decode(p, dest, dest + 1, dest + 2, dest + 3, dest + 4);
  }

 private:
  enum { kHeaderBytes = 2 };

  static uint8_t key(uint64_t x) {
    // __builtin_clzll is undefined for the x==0 case
    return uint8_t(7 - (__builtin_clzll(x | 1) / 8));
  }

  static uint8_t b0key(uint16_t x) {
    return x & 7u;
  }
  static uint8_t b1key(uint16_t x) {
    return (x >> 3) & 7u;
  }
  static uint8_t b2key(uint16_t x) {
    return (x >> 6) & 7u;
  }
  static uint8_t b3key(uint16_t x) {
    return (x >> 9) & 7u;
  }
  static uint8_t b4key(uint16_t x) {
    return (x >> 12) & 7u;
  }

  static const uint64_t kMask[];
};

typedef GroupVarint<uint32_t> GroupVarint32;
typedef GroupVarint<uint64_t> GroupVarint64;

/**
 * Simplify use of GroupVarint* for the case where data is available one
 * entry at a time (instead of one group at a time). Handles buffering
 * and an incomplete last chunk.
 *
 * Output is a function object that accepts character ranges:
 * out(StringPiece) appends the given character range to the output.
 */
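
// A minimal usage sketch (the names `result` and `values` are illustrative):
//
//   std::string result;
//   auto append = [&](StringPiece sp) { result.append(sp.data(), sp.size()); };
//   GroupVarintEncoder<uint32_t, decltype(append)> enc(append);
//   for (uint32_t v : values) {
//     enc.add(v);
//   }
//   enc.finish(); // flushes the final, possibly incomplete group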
template <class T, class Output>
class GroupVarintEncoder {
 public:
  typedef GroupVarint<T> Base;
  typedef T type;

  explicit GroupVarintEncoder(Output out) : out_(out), count_(0) {}

  ~GroupVarintEncoder() {
    finish();
  }

  /**
   * Add a value to the encoder.
   */
  void add(type val) {
    buf_[count_++] = val;
    if (count_ == Base::kGroupSize) {
      char* p = Base::encode(tmp_, buf_);
      out_(StringPiece(tmp_, p));
      count_ = 0;
    }
  }

  /**
   * Finish encoding, flushing any buffered values if necessary.
   * After finish(), the encoder is immediately ready to encode more data
   * to the same output.
   */
  void finish() {
    if (count_) {
      // This is not strictly necessary, but it makes testing easy;
      // uninitialized bytes are guaranteed to be recorded as taking one byte
      // (not more).
      for (size_t i = count_; i < Base::kGroupSize; i++) {
        buf_[i] = 0;
      }
      Base::encode(tmp_, buf_);
      out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
      count_ = 0;
    }
  }

  /**
   * Return the appender that was used.
   */
  Output& output() {
    return out_;
  }
  const Output& output() const {
    return out_;
  }

  /**
   * Reset the encoder, disregarding any state (except what was already
   * flushed to the output, of course).
   */
  void clear() {
    count_ = 0;
  }

 private:
  Output out_;
  char tmp_[Base::kMaxSize];
  type buf_[Base::kGroupSize];
  size_t count_;
};

/**
 * Simplify use of GroupVarint* for the case where the last group in the
 * input may be incomplete (but the exact size of the input is known).
 * Allows for extracting values one at a time.
 */
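
// A minimal usage sketch (assuming `encoded` holds GroupVarint32-encoded
// data, e.g. produced by the GroupVarintEncoder example above):
//
//   GroupVarint32Decoder dec(StringPiece(encoded));
//   uint32_t v;
//   while (dec.next(&v)) {
//     // consume v
//   }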
template <typename T>
class GroupVarintDecoder {
 public:
  typedef GroupVarint<T> Base;
  typedef T type;

  GroupVarintDecoder() = default;

  explicit GroupVarintDecoder(StringPiece data, size_t maxCount = (size_t)-1)
      : rrest_(data.end()),
        p_(data.data()),
        end_(data.end()),
        limit_(end_),
        pos_(0),
        count_(0),
        remaining_(maxCount) {}

  void reset(StringPiece data, size_t maxCount = (size_t)-1) {
    rrest_ = data.end();
    p_ = data.data();
    end_ = data.end();
    limit_ = end_;
    pos_ = 0;
    count_ = 0;
    remaining_ = maxCount;
  }

  /**
   * Read and return the next value.
   */
  bool next(type* val) {
    if (pos_ == count_) {
      // refill
      size_t rem = size_t(end_ - p_);
      if (rem == 0 || remaining_ == 0) {
        return false;
      }
      // next() attempts to read one full group at a time, so we must have
      // enough bytes readable past the end of the remaining data to handle
      // the case where the last group is full.
      //
      // The simplest way to guarantee this is to require that the data have at
      // least Base::kMaxSize - 1 bytes readable *after* its end; otherwise we
      // copy the tail into a temporary buffer.
      if (limit_ - p_ < Base::kMaxSize) {
        memcpy(tmp_, p_, rem);
        p_ = tmp_;
        end_ = p_ + rem;
        limit_ = tmp_ + sizeof(tmp_);
      }
      pos_ = 0;
      const char* n = Base::decode(p_, buf_);
      if (n <= end_) {
        // Full group could be decoded
        if (remaining_ >= Base::kGroupSize) {
          remaining_ -= Base::kGroupSize;
          count_ = Base::kGroupSize;
          p_ = n;
        } else {
          count_ = remaining_;
          remaining_ = 0;
          p_ += Base::partialSize(buf_, count_);
        }
      } else {
        // Can't decode a full group
        count_ = Base::partialCount(p_, size_t(end_ - p_));
        if (remaining_ >= count_) {
          remaining_ -= count_;
          p_ = end_;
        } else {
          count_ = remaining_;
          remaining_ = 0;
          p_ += Base::partialSize(buf_, count_);
        }
        if (count_ == 0) {
          return false;
        }
      }
    }
    *val = buf_[pos_++];
    return true;
  }

  StringPiece rest() const {
    // This is only valid after next() returned false
    CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
    // p_ may point to the internal buffer (tmp_), but we want to return a
    // subpiece of the original data.
    size_t size = size_t(end_ - p_);
    return StringPiece(rrest_ - size, rrest_);
  }

 private:
  const char* rrest_;
  const char* p_;
  const char* end_;
  const char* limit_;
  char tmp_[2 * Base::kMaxSize];
  type buf_[Base::kGroupSize];
  size_t pos_;
  size_t count_;
  size_t remaining_;
};

typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;

} // namespace folly

#endif /* FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64 */
665