1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18// From Apache Impala (incubating) as of 2016-01-29
19
20#include <cstdint>
21#include <cstring>
22#include <random>
23#include <vector>
24
25#include <gtest/gtest.h>
26
27#include <boost/utility.hpp> // IWYU pragma: export
28
29#include "arrow/util/bit-stream-utils.h"
30#include "arrow/util/bit-util.h"
31#include "arrow/util/rle-encoding.h"
32
33using std::vector;
34
35namespace arrow {
36namespace util {
37
38const int MAX_WIDTH = 32;
39
40TEST(BitArray, TestBool) {
41 const int len = 8;
42 uint8_t buffer[len];
43
44 BitUtil::BitWriter writer(buffer, len);
45
46 // Write alternating 0's and 1's
47 for (int i = 0; i < 8; ++i) {
48 bool result = writer.PutValue(i % 2, 1);
49 EXPECT_TRUE(result);
50 }
51 writer.Flush();
52 EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
53
54 // Write 00110011
55 for (int i = 0; i < 8; ++i) {
56 bool result = false;
57 switch (i) {
58 case 0:
59 case 1:
60 case 4:
61 case 5:
62 result = writer.PutValue(false, 1);
63 break;
64 default:
65 result = writer.PutValue(true, 1);
66 break;
67 }
68 EXPECT_TRUE(result);
69 }
70 writer.Flush();
71
72 // Validate the exact bit value
73 EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0));
74 EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0));
75
76 // Use the reader and validate
77 BitUtil::BitReader reader(buffer, len);
78 for (int i = 0; i < 8; ++i) {
79 bool val = false;
80 bool result = reader.GetValue(1, &val);
81 EXPECT_TRUE(result);
82 EXPECT_EQ(val, (i % 2) != 0);
83 }
84
85 for (int i = 0; i < 8; ++i) {
86 bool val = false;
87 bool result = reader.GetValue(1, &val);
88 EXPECT_TRUE(result);
89 switch (i) {
90 case 0:
91 case 1:
92 case 4:
93 case 5:
94 EXPECT_EQ(val, false);
95 break;
96 default:
97 EXPECT_EQ(val, true);
98 break;
99 }
100 }
101}
102
103// Writes 'num_vals' values with width 'bit_width' and reads them back.
104void TestBitArrayValues(int bit_width, int num_vals) {
105 int len = static_cast<int>(BitUtil::BytesForBits(bit_width * num_vals));
106 EXPECT_GT(len, 0);
107 const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width;
108
109 std::vector<uint8_t> buffer(len);
110 BitUtil::BitWriter writer(buffer.data(), len);
111 for (int i = 0; i < num_vals; ++i) {
112 bool result = writer.PutValue(i % mod, bit_width);
113 EXPECT_TRUE(result);
114 }
115 writer.Flush();
116 EXPECT_EQ(writer.bytes_written(), len);
117
118 BitUtil::BitReader reader(buffer.data(), len);
119 for (int i = 0; i < num_vals; ++i) {
120 int64_t val = 0;
121 bool result = reader.GetValue(bit_width, &val);
122 EXPECT_TRUE(result);
123 EXPECT_EQ(val, i % mod);
124 }
125 EXPECT_EQ(reader.bytes_left(), 0);
126}
127
128TEST(BitArray, TestValues) {
129 for (int width = 1; width <= MAX_WIDTH; ++width) {
130 TestBitArrayValues(width, 1);
131 TestBitArrayValues(width, 2);
132 // Don't write too many values
133 TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096);
134 TestBitArrayValues(width, 1024);
135 }
136}
137
138// Test some mixed values
139TEST(BitArray, TestMixed) {
140 const int len = 1024;
141 uint8_t buffer[len];
142 bool parity = true;
143
144 BitUtil::BitWriter writer(buffer, len);
145 for (int i = 0; i < len; ++i) {
146 bool result;
147 if (i % 2 == 0) {
148 result = writer.PutValue(parity, 1);
149 parity = !parity;
150 } else {
151 result = writer.PutValue(i, 10);
152 }
153 EXPECT_TRUE(result);
154 }
155 writer.Flush();
156
157 parity = true;
158 BitUtil::BitReader reader(buffer, len);
159 for (int i = 0; i < len; ++i) {
160 bool result;
161 if (i % 2 == 0) {
162 bool val;
163 result = reader.GetValue(1, &val);
164 EXPECT_EQ(val, parity);
165 parity = !parity;
166 } else {
167 int val;
168 result = reader.GetValue(10, &val);
169 EXPECT_EQ(val, i);
170 }
171 EXPECT_TRUE(result);
172 }
173}
174
175// Validates encoding of values by encoding and decoding them. If
176// expected_encoding != NULL, also validates that the encoded buffer is
177// exactly 'expected_encoding'.
178// if expected_len is not -1, it will validate the encoded size is correct.
179void ValidateRle(const vector<int>& values, int bit_width, uint8_t* expected_encoding,
180 int expected_len) {
181 const int len = 64 * 1024;
182 uint8_t buffer[len];
183 EXPECT_LE(expected_len, len);
184
185 RleEncoder encoder(buffer, len, bit_width);
186 for (size_t i = 0; i < values.size(); ++i) {
187 bool result = encoder.Put(values[i]);
188 EXPECT_TRUE(result);
189 }
190 int encoded_len = encoder.Flush();
191
192 if (expected_len != -1) {
193 EXPECT_EQ(encoded_len, expected_len);
194 }
195 if (expected_encoding != NULL) {
196 EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0);
197 }
198
199 // Verify read
200 {
201 RleDecoder decoder(buffer, len, bit_width);
202 for (size_t i = 0; i < values.size(); ++i) {
203 uint64_t val;
204 bool result = decoder.Get(&val);
205 EXPECT_TRUE(result);
206 EXPECT_EQ(values[i], val);
207 }
208 }
209
210 // Verify batch read
211 {
212 RleDecoder decoder(buffer, len, bit_width);
213 vector<int> values_read(values.size());
214 ASSERT_EQ(values.size(),
215 decoder.GetBatch(values_read.data(), static_cast<int>(values.size())));
216 EXPECT_EQ(values, values_read);
217 }
218}
219
220// A version of ValidateRle that round-trips the values and returns false if
221// the returned values are not all the same
222bool CheckRoundTrip(const vector<int>& values, int bit_width) {
223 const int len = 64 * 1024;
224 uint8_t buffer[len];
225 RleEncoder encoder(buffer, len, bit_width);
226 for (size_t i = 0; i < values.size(); ++i) {
227 bool result = encoder.Put(values[i]);
228 if (!result) {
229 return false;
230 }
231 }
232 int encoded_len = encoder.Flush();
233 int out = 0;
234
235 {
236 RleDecoder decoder(buffer, encoded_len, bit_width);
237 for (size_t i = 0; i < values.size(); ++i) {
238 EXPECT_TRUE(decoder.Get(&out));
239 if (values[i] != out) {
240 return false;
241 }
242 }
243 }
244
245 // Verify batch read
246 {
247 RleDecoder decoder(buffer, len, bit_width);
248 vector<int> values_read(values.size());
249 if (static_cast<int>(values.size()) !=
250 decoder.GetBatch(values_read.data(), static_cast<int>(values.size()))) {
251 return false;
252 }
253 if (values != values_read) {
254 return false;
255 }
256 }
257
258 return true;
259}
260
261TEST(Rle, SpecificSequences) {
262 const int len = 1024;
263 uint8_t expected_buffer[len];
264 vector<int> values;
265
266 // Test 50 0' followed by 50 1's
267 values.resize(100);
268 for (int i = 0; i < 50; ++i) {
269 values[i] = 0;
270 }
271 for (int i = 50; i < 100; ++i) {
272 values[i] = 1;
273 }
274
275 // expected_buffer valid for bit width <= 1 byte
276 expected_buffer[0] = (50 << 1);
277 expected_buffer[1] = 0;
278 expected_buffer[2] = (50 << 1);
279 expected_buffer[3] = 1;
280 for (int width = 1; width <= 8; ++width) {
281 ValidateRle(values, width, expected_buffer, 4);
282 }
283
284 for (int width = 9; width <= MAX_WIDTH; ++width) {
285 ValidateRle(values, width, NULL,
286 2 * (1 + static_cast<int>(BitUtil::CeilDiv(width, 8))));
287 }
288
289 // Test 100 0's and 1's alternating
290 for (int i = 0; i < 100; ++i) {
291 values[i] = i % 2;
292 }
293 int num_groups = static_cast<int>(BitUtil::CeilDiv(100, 8));
294 expected_buffer[0] = static_cast<uint8_t>((num_groups << 1) | 1);
295 for (int i = 1; i <= 100 / 8; ++i) {
296 expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0);
297 }
298 // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0.
299 expected_buffer[100 / 8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0);
300
301 // num_groups and expected_buffer only valid for bit width = 1
302 ValidateRle(values, 1, expected_buffer, 1 + num_groups);
303 for (int width = 2; width <= MAX_WIDTH; ++width) {
304 int num_values = static_cast<int>(BitUtil::CeilDiv(100, 8)) * 8;
305 ValidateRle(values, width, NULL,
306 1 + static_cast<int>(BitUtil::CeilDiv(width * num_values, 8)));
307 }
308}
309
310// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value
311// is used, otherwise alternating values are used.
312void TestRleValues(int bit_width, int num_vals, int value = -1) {
313 const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width;
314 vector<int> values;
315 for (int v = 0; v < num_vals; ++v) {
316 values.push_back((value != -1) ? value : static_cast<int>(v % mod));
317 }
318 ValidateRle(values, bit_width, NULL, -1);
319}
320
321TEST(Rle, TestValues) {
322 for (int width = 1; width <= MAX_WIDTH; ++width) {
323 TestRleValues(width, 1);
324 TestRleValues(width, 1024);
325 TestRleValues(width, 1024, 0);
326 TestRleValues(width, 1024, 1);
327 }
328}
329
330TEST(Rle, BitWidthZeroRepeated) {
331 uint8_t buffer[1];
332 const int num_values = 15;
333 buffer[0] = num_values << 1; // repeated indicator byte
334 RleDecoder decoder(buffer, sizeof(buffer), 0);
335 uint8_t val;
336 for (int i = 0; i < num_values; ++i) {
337 bool result = decoder.Get(&val);
338 EXPECT_TRUE(result);
339 EXPECT_EQ(val, 0); // can only encode 0s with bit width 0
340 }
341 EXPECT_FALSE(decoder.Get(&val));
342}
343
344TEST(Rle, BitWidthZeroLiteral) {
345 uint8_t buffer[1];
346 const int num_groups = 4;
347 buffer[0] = num_groups << 1 | 1; // literal indicator byte
348 RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
349 const int num_values = num_groups * 8;
350 uint8_t val;
351 for (int i = 0; i < num_values; ++i) {
352 bool result = decoder.Get(&val);
353 EXPECT_TRUE(result);
354 EXPECT_EQ(val, 0); // can only encode 0s with bit width 0
355 }
356 EXPECT_FALSE(decoder.Get(&val));
357}
358
359// Test that writes out a repeated group and then a literal
360// group but flush before finishing.
361TEST(BitRle, Flush) {
362 vector<int> values;
363 for (int i = 0; i < 16; ++i) values.push_back(1);
364 values.push_back(0);
365 ValidateRle(values, 1, NULL, -1);
366 values.push_back(1);
367 ValidateRle(values, 1, NULL, -1);
368 values.push_back(1);
369 ValidateRle(values, 1, NULL, -1);
370 values.push_back(1);
371 ValidateRle(values, 1, NULL, -1);
372}
373
374// Test some random sequences.
375TEST(BitRle, Random) {
376 int niters = 50;
377 int ngroups = 1000;
378 int max_group_size = 16;
379 vector<int> values(ngroups + max_group_size);
380
381 // prng setup
382 std::random_device rd;
383 std::uniform_int_distribution<int> dist(1, 20);
384
385 for (int iter = 0; iter < niters; ++iter) {
386 // generate a seed with device entropy
387 uint32_t seed = rd();
388 std::mt19937 gen(seed);
389
390 bool parity = 0;
391 values.resize(0);
392
393 for (int i = 0; i < ngroups; ++i) {
394 int group_size = dist(gen);
395 if (group_size > max_group_size) {
396 group_size = 1;
397 }
398 for (int i = 0; i < group_size; ++i) {
399 values.push_back(parity);
400 }
401 parity = !parity;
402 }
403 if (!CheckRoundTrip(values, BitUtil::NumRequiredBits(values.size()))) {
404 FAIL() << "failing seed: " << seed;
405 }
406 }
407}
408
409// Test a sequence of 1 0's, 2 1's, 3 0's. etc
410// e.g. 011000111100000
411TEST(BitRle, RepeatedPattern) {
412 vector<int> values;
413 const int min_run = 1;
414 const int max_run = 32;
415
416 for (int i = min_run; i <= max_run; ++i) {
417 int v = i % 2;
418 for (int j = 0; j < i; ++j) {
419 values.push_back(v);
420 }
421 }
422
423 // And go back down again
424 for (int i = max_run; i >= min_run; --i) {
425 int v = i % 2;
426 for (int j = 0; j < i; ++j) {
427 values.push_back(v);
428 }
429 }
430
431 ValidateRle(values, 1, NULL, -1);
432}
433
434TEST(BitRle, Overflow) {
435 for (int bit_width = 1; bit_width < 32; bit_width += 3) {
436 int len = RleEncoder::MinBufferSize(bit_width);
437 std::vector<uint8_t> buffer(len);
438 int num_added = 0;
439 bool parity = true;
440
441 RleEncoder encoder(buffer.data(), len, bit_width);
442 // Insert alternating true/false until there is no space left
443 while (true) {
444 bool result = encoder.Put(parity);
445 parity = !parity;
446 if (!result) break;
447 ++num_added;
448 }
449
450 int bytes_written = encoder.Flush();
451 EXPECT_LE(bytes_written, len);
452 EXPECT_GT(num_added, 0);
453
454 RleDecoder decoder(buffer.data(), bytes_written, bit_width);
455 parity = true;
456 uint32_t v;
457 for (int i = 0; i < num_added; ++i) {
458 bool result = decoder.Get(&v);
459 EXPECT_TRUE(result);
460 EXPECT_EQ(v != 0, parity);
461 parity = !parity;
462 }
463 // Make sure we get false when reading past end a couple times.
464 EXPECT_FALSE(decoder.Get(&v));
465 EXPECT_FALSE(decoder.Get(&v));
466 }
467}
468
469} // namespace util
470} // namespace arrow
471