1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <gtest/gtest.h>
19#include <cstdint>
20#include <cstdlib>
21#include <cstring>
22#include <string>
23#include <vector>
24
25#include "arrow/util/bit-util.h"
26
27#include "parquet/encoding-internal.h"
28#include "parquet/schema.h"
29#include "parquet/types.h"
30#include "parquet/util/memory.h"
31#include "parquet/util/test-common.h"
32
33using arrow::default_memory_pool;
34using arrow::MemoryPool;
35
36using std::string;
37using std::vector;
38
39namespace parquet {
40
41namespace test {
42
43TEST(VectorBooleanTest, TestEncodeDecode) {
44 // PARQUET-454
45 int nvalues = 10000;
46 int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues));
47
48 // seed the prng so failure is deterministic
49 vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);
50
51 PlainEncoder<BooleanType> encoder(nullptr);
52 PlainDecoder<BooleanType> decoder(nullptr);
53
54 encoder.Put(draws, nvalues);
55
56 std::shared_ptr<Buffer> encode_buffer = encoder.FlushValues();
57 ASSERT_EQ(nbytes, encode_buffer->size());
58
59 vector<uint8_t> decode_buffer(nbytes);
60 const uint8_t* decode_data = &decode_buffer[0];
61
62 decoder.SetData(nvalues, encode_buffer->data(),
63 static_cast<int>(encode_buffer->size()));
64 int values_decoded = decoder.Decode(&decode_buffer[0], nvalues);
65 ASSERT_EQ(nvalues, values_decoded);
66
67 for (int i = 0; i < nvalues; ++i) {
68 ASSERT_EQ(draws[i], BitUtil::GetBit(decode_data, i)) << i;
69 }
70}
71
72// ----------------------------------------------------------------------
73// test data generation
74
75template <typename T>
76void GenerateData(int num_values, T* out, vector<uint8_t>* heap) {
77 // seed the prng so failure is deterministic
78 random_numbers(num_values, 0, std::numeric_limits<T>::min(),
79 std::numeric_limits<T>::max(), out);
80}
81
82template <>
83void GenerateData<bool>(int num_values, bool* out, vector<uint8_t>* heap) {
84 // seed the prng so failure is deterministic
85 random_bools(num_values, 0.5, 0, out);
86}
87
88template <>
89void GenerateData<Int96>(int num_values, Int96* out, vector<uint8_t>* heap) {
90 // seed the prng so failure is deterministic
91 random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
92 std::numeric_limits<int32_t>::max(), out);
93}
94
95template <>
96void GenerateData<ByteArray>(int num_values, ByteArray* out, vector<uint8_t>* heap) {
97 // seed the prng so failure is deterministic
98 int max_byte_array_len = 12;
99 heap->resize(num_values * max_byte_array_len);
100 random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
101}
102
// Byte width used for every FIXED_LEN_BYTE_ARRAY value in this file. It is only
// ever read (by value), so it is a compile-time constant rather than a mutable
// global.
static constexpr int flba_length = 8;
104
105template <>
106void GenerateData<FLBA>(int num_values, FLBA* out, vector<uint8_t>* heap) {
107 // seed the prng so failure is deterministic
108 heap->resize(num_values * flba_length);
109 random_fixed_byte_array(num_values, 0, heap->data(), flba_length, out);
110}
111
// Asserts that result[i] == expected[i] for every i in [0, num_values). Stops at the
// first mismatch (fatal gtest assertion) and reports its index.
template <typename T>
void VerifyResults(T* result, T* expected, int num_values) {
  int i = 0;
  while (i < num_values) {
    ASSERT_EQ(expected[i], result[i]) << i;
    ++i;
  }
}
118
119template <>
120void VerifyResults<FLBA>(FLBA* result, FLBA* expected, int num_values) {
121 for (int i = 0; i < num_values; ++i) {
122 ASSERT_EQ(0, memcmp(expected[i].ptr, result[i].ptr, flba_length)) << i;
123 }
124}
125
126// ----------------------------------------------------------------------
127// Create some column descriptors
128
129template <typename DType>
130std::shared_ptr<ColumnDescriptor> ExampleDescr() {
131 auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL, DType::type_num);
132 return std::make_shared<ColumnDescriptor>(node, 0, 0);
133}
134
135template <>
136std::shared_ptr<ColumnDescriptor> ExampleDescr<FLBAType>() {
137 auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL,
138 Type::FIXED_LEN_BYTE_ARRAY,
139 LogicalType::DECIMAL, flba_length, 10, 2);
140 return std::make_shared<ColumnDescriptor>(node, 0, 0);
141}
142
143// ----------------------------------------------------------------------
144// Plain encoding tests
145
146template <typename Type>
147class TestEncodingBase : public ::testing::Test {
148 public:
149 typedef typename Type::c_type T;
150 static constexpr int TYPE = Type::type_num;
151
152 void SetUp() {
153 descr_ = ExampleDescr<Type>();
154 type_length_ = descr_->type_length();
155 allocator_ = default_memory_pool();
156 }
157
158 void TearDown() {}
159
160 void InitData(int nvalues, int repeats) {
161 num_values_ = nvalues * repeats;
162 input_bytes_.resize(num_values_ * sizeof(T));
163 output_bytes_.resize(num_values_ * sizeof(T));
164 draws_ = reinterpret_cast<T*>(input_bytes_.data());
165 decode_buf_ = reinterpret_cast<T*>(output_bytes_.data());
166 GenerateData<T>(nvalues, draws_, &data_buffer_);
167
168 // add some repeated values
169 for (int j = 1; j < repeats; ++j) {
170 for (int i = 0; i < nvalues; ++i) {
171 draws_[nvalues * j + i] = draws_[i];
172 }
173 }
174 }
175
176 virtual void CheckRoundtrip() = 0;
177
178 void Execute(int nvalues, int repeats) {
179 InitData(nvalues, repeats);
180 CheckRoundtrip();
181 }
182
183 protected:
184 MemoryPool* allocator_;
185
186 int num_values_;
187 int type_length_;
188 T* draws_;
189 T* decode_buf_;
190 vector<uint8_t> input_bytes_;
191 vector<uint8_t> output_bytes_;
192 vector<uint8_t> data_buffer_;
193
194 std::shared_ptr<Buffer> encode_buffer_;
195 std::shared_ptr<ColumnDescriptor> descr_;
196};
197
// Members of a dependent base class template (TestEncodingBase<Type>) are not found
// by unqualified name lookup inside a derived class template. This macro pulls the
// base members listed below into the subclass with using-declarations so they can be
// referred to directly. The final declaration intentionally has no semicolon: the
// invocation site supplies it. Possibly figure out an alternative to this class
// layering at some point.
#define USING_BASE_MEMBERS()                    \
  using TestEncodingBase<Type>::allocator_;     \
  using TestEncodingBase<Type>::num_values_;    \
  using TestEncodingBase<Type>::type_length_;   \
  using TestEncodingBase<Type>::draws_;         \
  using TestEncodingBase<Type>::decode_buf_;    \
  using TestEncodingBase<Type>::data_buffer_;   \
  using TestEncodingBase<Type>::encode_buffer_; \
  using TestEncodingBase<Type>::descr_
209
210template <typename Type>
211class TestPlainEncoding : public TestEncodingBase<Type> {
212 public:
213 typedef typename Type::c_type T;
214 static constexpr int TYPE = Type::type_num;
215
216 virtual void CheckRoundtrip() {
217 PlainEncoder<Type> encoder(descr_.get());
218 PlainDecoder<Type> decoder(descr_.get());
219 encoder.Put(draws_, num_values_);
220 encode_buffer_ = encoder.FlushValues();
221
222 decoder.SetData(num_values_, encode_buffer_->data(),
223 static_cast<int>(encode_buffer_->size()));
224 int values_decoded = decoder.Decode(decode_buf_, num_values_);
225 ASSERT_EQ(num_values_, values_decoded);
226 ASSERT_NO_FATAL_FAILURE(VerifyResults<T>(decode_buf_, draws_, num_values_));
227 }
228
229 protected:
230 USING_BASE_MEMBERS();
231};
232
233TYPED_TEST_CASE(TestPlainEncoding, ParquetTypes);
234
235TYPED_TEST(TestPlainEncoding, BasicRoundTrip) {
236 ASSERT_NO_FATAL_FAILURE(this->Execute(10000, 1));
237}
238
239// ----------------------------------------------------------------------
240// Dictionary encoding tests
241
242typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
243 ByteArrayType, FLBAType>
244 DictEncodedTypes;
245
246template <typename Type>
247class TestDictionaryEncoding : public TestEncodingBase<Type> {
248 public:
249 typedef typename Type::c_type T;
250 static constexpr int TYPE = Type::type_num;
251
252 void CheckRoundtrip() {
253 std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255);
254 DictEncoder<Type> encoder(descr_.get());
255
256 ASSERT_NO_THROW(encoder.Put(draws_, num_values_));
257 dict_buffer_ = AllocateBuffer(default_memory_pool(), encoder.dict_encoded_size());
258 encoder.WriteDict(dict_buffer_->mutable_data());
259 std::shared_ptr<Buffer> indices = encoder.FlushValues();
260
261 DictEncoder<Type> spaced_encoder(descr_.get());
262 // PutSpaced should lead to the same results
263 ASSERT_NO_THROW(spaced_encoder.PutSpaced(draws_, num_values_, valid_bits.data(), 0));
264 std::shared_ptr<Buffer> indices_from_spaced = spaced_encoder.FlushValues();
265 ASSERT_TRUE(indices_from_spaced->Equals(*indices));
266
267 PlainDecoder<Type> dict_decoder(descr_.get());
268 dict_decoder.SetData(encoder.num_entries(), dict_buffer_->data(),
269 static_cast<int>(dict_buffer_->size()));
270
271 DictionaryDecoder<Type> decoder(descr_.get());
272 decoder.SetDict(&dict_decoder);
273
274 decoder.SetData(num_values_, indices->data(), static_cast<int>(indices->size()));
275 int values_decoded = decoder.Decode(decode_buf_, num_values_);
276 ASSERT_EQ(num_values_, values_decoded);
277
278 // TODO(wesm): The DictionaryDecoder must stay alive because the decoded
279 // values' data is owned by a buffer inside the DictionaryEncoder. We
280 // should revisit when data lifetime is reviewed more generally.
281 ASSERT_NO_FATAL_FAILURE(VerifyResults<T>(decode_buf_, draws_, num_values_));
282
283 // Also test spaced decoding
284 decoder.SetData(num_values_, indices->data(), static_cast<int>(indices->size()));
285 values_decoded =
286 decoder.DecodeSpaced(decode_buf_, num_values_, 0, valid_bits.data(), 0);
287 ASSERT_EQ(num_values_, values_decoded);
288 ASSERT_NO_FATAL_FAILURE(VerifyResults<T>(decode_buf_, draws_, num_values_));
289 }
290
291 protected:
292 USING_BASE_MEMBERS();
293 std::shared_ptr<ResizableBuffer> dict_buffer_;
294};
295
296TYPED_TEST_CASE(TestDictionaryEncoding, DictEncodedTypes);
297
298TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) {
299 ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2));
300}
301
302TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) {
303 PlainDecoder<BooleanType> dict_decoder(nullptr);
304 DictionaryDecoder<BooleanType> decoder(nullptr);
305
306 ASSERT_THROW(decoder.SetDict(&dict_decoder), ParquetException);
307}
308
309} // namespace test
310
311} // namespace parquet
312