1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
#include <gtest/gtest.h>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <string>
#include <vector>
24 | |
25 | #include "arrow/util/bit-util.h" |
26 | |
27 | #include "parquet/encoding-internal.h" |
28 | #include "parquet/schema.h" |
29 | #include "parquet/types.h" |
30 | #include "parquet/util/memory.h" |
31 | #include "parquet/util/test-common.h" |
32 | |
33 | using arrow::default_memory_pool; |
34 | using arrow::MemoryPool; |
35 | |
36 | using std::string; |
37 | using std::vector; |
38 | |
39 | namespace parquet { |
40 | |
41 | namespace test { |
42 | |
43 | TEST(VectorBooleanTest, TestEncodeDecode) { |
44 | // PARQUET-454 |
45 | int nvalues = 10000; |
46 | int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues)); |
47 | |
48 | // seed the prng so failure is deterministic |
49 | vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0); |
50 | |
51 | PlainEncoder<BooleanType> encoder(nullptr); |
52 | PlainDecoder<BooleanType> decoder(nullptr); |
53 | |
54 | encoder.Put(draws, nvalues); |
55 | |
56 | std::shared_ptr<Buffer> encode_buffer = encoder.FlushValues(); |
57 | ASSERT_EQ(nbytes, encode_buffer->size()); |
58 | |
59 | vector<uint8_t> decode_buffer(nbytes); |
60 | const uint8_t* decode_data = &decode_buffer[0]; |
61 | |
62 | decoder.SetData(nvalues, encode_buffer->data(), |
63 | static_cast<int>(encode_buffer->size())); |
64 | int values_decoded = decoder.Decode(&decode_buffer[0], nvalues); |
65 | ASSERT_EQ(nvalues, values_decoded); |
66 | |
67 | for (int i = 0; i < nvalues; ++i) { |
68 | ASSERT_EQ(draws[i], BitUtil::GetBit(decode_data, i)) << i; |
69 | } |
70 | } |
71 | |
72 | // ---------------------------------------------------------------------- |
73 | // test data generation |
74 | |
75 | template <typename T> |
76 | void GenerateData(int num_values, T* out, vector<uint8_t>* heap) { |
77 | // seed the prng so failure is deterministic |
78 | random_numbers(num_values, 0, std::numeric_limits<T>::min(), |
79 | std::numeric_limits<T>::max(), out); |
80 | } |
81 | |
82 | template <> |
83 | void GenerateData<bool>(int num_values, bool* out, vector<uint8_t>* heap) { |
84 | // seed the prng so failure is deterministic |
85 | random_bools(num_values, 0.5, 0, out); |
86 | } |
87 | |
88 | template <> |
89 | void GenerateData<Int96>(int num_values, Int96* out, vector<uint8_t>* heap) { |
90 | // seed the prng so failure is deterministic |
91 | random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(), |
92 | std::numeric_limits<int32_t>::max(), out); |
93 | } |
94 | |
95 | template <> |
96 | void GenerateData<ByteArray>(int num_values, ByteArray* out, vector<uint8_t>* heap) { |
97 | // seed the prng so failure is deterministic |
98 | int max_byte_array_len = 12; |
99 | heap->resize(num_values * max_byte_array_len); |
100 | random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len); |
101 | } |
102 | |
// Byte width used for every FIXED_LEN_BYTE_ARRAY value in these tests. It is
// only ever read, so it is a compile-time constant rather than a mutable
// file-scope global.
static constexpr int flba_length = 8;
104 | |
105 | template <> |
106 | void GenerateData<FLBA>(int num_values, FLBA* out, vector<uint8_t>* heap) { |
107 | // seed the prng so failure is deterministic |
108 | heap->resize(num_values * flba_length); |
109 | random_fixed_byte_array(num_values, 0, heap->data(), flba_length, out); |
110 | } |
111 | |
// Asserts element-wise equality of the first `num_values` entries of `result`
// and `expected`; the index is appended to the failure message. Stops at the
// first mismatch because ASSERT_EQ is fatal.
template <typename T>
void VerifyResults(T* result, T* expected, int num_values) {
  int i = 0;
  while (i < num_values) {
    ASSERT_EQ(expected[i], result[i]) << i;
    ++i;
  }
}
118 | |
119 | template <> |
120 | void VerifyResults<FLBA>(FLBA* result, FLBA* expected, int num_values) { |
121 | for (int i = 0; i < num_values; ++i) { |
122 | ASSERT_EQ(0, memcmp(expected[i].ptr, result[i].ptr, flba_length)) << i; |
123 | } |
124 | } |
125 | |
126 | // ---------------------------------------------------------------------- |
127 | // Create some column descriptors |
128 | |
129 | template <typename DType> |
130 | std::shared_ptr<ColumnDescriptor> ExampleDescr() { |
131 | auto node = schema::PrimitiveNode::Make("name" , Repetition::OPTIONAL, DType::type_num); |
132 | return std::make_shared<ColumnDescriptor>(node, 0, 0); |
133 | } |
134 | |
135 | template <> |
136 | std::shared_ptr<ColumnDescriptor> ExampleDescr<FLBAType>() { |
137 | auto node = schema::PrimitiveNode::Make("name" , Repetition::OPTIONAL, |
138 | Type::FIXED_LEN_BYTE_ARRAY, |
139 | LogicalType::DECIMAL, flba_length, 10, 2); |
140 | return std::make_shared<ColumnDescriptor>(node, 0, 0); |
141 | } |
142 | |
143 | // ---------------------------------------------------------------------- |
144 | // Plain encoding tests |
145 | |
146 | template <typename Type> |
147 | class TestEncodingBase : public ::testing::Test { |
148 | public: |
149 | typedef typename Type::c_type T; |
150 | static constexpr int TYPE = Type::type_num; |
151 | |
152 | void SetUp() { |
153 | descr_ = ExampleDescr<Type>(); |
154 | type_length_ = descr_->type_length(); |
155 | allocator_ = default_memory_pool(); |
156 | } |
157 | |
158 | void TearDown() {} |
159 | |
160 | void InitData(int nvalues, int repeats) { |
161 | num_values_ = nvalues * repeats; |
162 | input_bytes_.resize(num_values_ * sizeof(T)); |
163 | output_bytes_.resize(num_values_ * sizeof(T)); |
164 | draws_ = reinterpret_cast<T*>(input_bytes_.data()); |
165 | decode_buf_ = reinterpret_cast<T*>(output_bytes_.data()); |
166 | GenerateData<T>(nvalues, draws_, &data_buffer_); |
167 | |
168 | // add some repeated values |
169 | for (int j = 1; j < repeats; ++j) { |
170 | for (int i = 0; i < nvalues; ++i) { |
171 | draws_[nvalues * j + i] = draws_[i]; |
172 | } |
173 | } |
174 | } |
175 | |
176 | virtual void CheckRoundtrip() = 0; |
177 | |
178 | void Execute(int nvalues, int repeats) { |
179 | InitData(nvalues, repeats); |
180 | CheckRoundtrip(); |
181 | } |
182 | |
183 | protected: |
184 | MemoryPool* allocator_; |
185 | |
186 | int num_values_; |
187 | int type_length_; |
188 | T* draws_; |
189 | T* decode_buf_; |
190 | vector<uint8_t> input_bytes_; |
191 | vector<uint8_t> output_bytes_; |
192 | vector<uint8_t> data_buffer_; |
193 | |
194 | std::shared_ptr<Buffer> encode_buffer_; |
195 | std::shared_ptr<ColumnDescriptor> descr_; |
196 | }; |
197 | |
// Members of a base class that depends on the template parameter are not found
// by unqualified lookup inside the derived template, so each fixture re-exports
// the base members it uses. Possibly figure out an alternative to this class
// layering at some point.
#define USING_BASE_MEMBERS()                    \
  using TestEncodingBase<Type>::allocator_;     \
  using TestEncodingBase<Type>::num_values_;    \
  using TestEncodingBase<Type>::type_length_;   \
  using TestEncodingBase<Type>::draws_;         \
  using TestEncodingBase<Type>::decode_buf_;    \
  using TestEncodingBase<Type>::data_buffer_;   \
  using TestEncodingBase<Type>::encode_buffer_; \
  using TestEncodingBase<Type>::descr_
209 | |
210 | template <typename Type> |
211 | class TestPlainEncoding : public TestEncodingBase<Type> { |
212 | public: |
213 | typedef typename Type::c_type T; |
214 | static constexpr int TYPE = Type::type_num; |
215 | |
216 | virtual void CheckRoundtrip() { |
217 | PlainEncoder<Type> encoder(descr_.get()); |
218 | PlainDecoder<Type> decoder(descr_.get()); |
219 | encoder.Put(draws_, num_values_); |
220 | encode_buffer_ = encoder.FlushValues(); |
221 | |
222 | decoder.SetData(num_values_, encode_buffer_->data(), |
223 | static_cast<int>(encode_buffer_->size())); |
224 | int values_decoded = decoder.Decode(decode_buf_, num_values_); |
225 | ASSERT_EQ(num_values_, values_decoded); |
226 | ASSERT_NO_FATAL_FAILURE(VerifyResults<T>(decode_buf_, draws_, num_values_)); |
227 | } |
228 | |
229 | protected: |
230 | USING_BASE_MEMBERS(); |
231 | }; |
232 | |
233 | TYPED_TEST_CASE(TestPlainEncoding, ParquetTypes); |
234 | |
235 | TYPED_TEST(TestPlainEncoding, BasicRoundTrip) { |
236 | ASSERT_NO_FATAL_FAILURE(this->Execute(10000, 1)); |
237 | } |
238 | |
239 | // ---------------------------------------------------------------------- |
240 | // Dictionary encoding tests |
241 | |
242 | typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType, |
243 | ByteArrayType, FLBAType> |
244 | DictEncodedTypes; |
245 | |
246 | template <typename Type> |
247 | class TestDictionaryEncoding : public TestEncodingBase<Type> { |
248 | public: |
249 | typedef typename Type::c_type T; |
250 | static constexpr int TYPE = Type::type_num; |
251 | |
252 | void CheckRoundtrip() { |
253 | std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255); |
254 | DictEncoder<Type> encoder(descr_.get()); |
255 | |
256 | ASSERT_NO_THROW(encoder.Put(draws_, num_values_)); |
257 | dict_buffer_ = AllocateBuffer(default_memory_pool(), encoder.dict_encoded_size()); |
258 | encoder.WriteDict(dict_buffer_->mutable_data()); |
259 | std::shared_ptr<Buffer> indices = encoder.FlushValues(); |
260 | |
261 | DictEncoder<Type> spaced_encoder(descr_.get()); |
262 | // PutSpaced should lead to the same results |
263 | ASSERT_NO_THROW(spaced_encoder.PutSpaced(draws_, num_values_, valid_bits.data(), 0)); |
264 | std::shared_ptr<Buffer> indices_from_spaced = spaced_encoder.FlushValues(); |
265 | ASSERT_TRUE(indices_from_spaced->Equals(*indices)); |
266 | |
267 | PlainDecoder<Type> dict_decoder(descr_.get()); |
268 | dict_decoder.SetData(encoder.num_entries(), dict_buffer_->data(), |
269 | static_cast<int>(dict_buffer_->size())); |
270 | |
271 | DictionaryDecoder<Type> decoder(descr_.get()); |
272 | decoder.SetDict(&dict_decoder); |
273 | |
274 | decoder.SetData(num_values_, indices->data(), static_cast<int>(indices->size())); |
275 | int values_decoded = decoder.Decode(decode_buf_, num_values_); |
276 | ASSERT_EQ(num_values_, values_decoded); |
277 | |
278 | // TODO(wesm): The DictionaryDecoder must stay alive because the decoded |
279 | // values' data is owned by a buffer inside the DictionaryEncoder. We |
280 | // should revisit when data lifetime is reviewed more generally. |
281 | ASSERT_NO_FATAL_FAILURE(VerifyResults<T>(decode_buf_, draws_, num_values_)); |
282 | |
283 | // Also test spaced decoding |
284 | decoder.SetData(num_values_, indices->data(), static_cast<int>(indices->size())); |
285 | values_decoded = |
286 | decoder.DecodeSpaced(decode_buf_, num_values_, 0, valid_bits.data(), 0); |
287 | ASSERT_EQ(num_values_, values_decoded); |
288 | ASSERT_NO_FATAL_FAILURE(VerifyResults<T>(decode_buf_, draws_, num_values_)); |
289 | } |
290 | |
291 | protected: |
292 | USING_BASE_MEMBERS(); |
293 | std::shared_ptr<ResizableBuffer> dict_buffer_; |
294 | }; |
295 | |
296 | TYPED_TEST_CASE(TestDictionaryEncoding, DictEncodedTypes); |
297 | |
298 | TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) { |
299 | ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2)); |
300 | } |
301 | |
302 | TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { |
303 | PlainDecoder<BooleanType> dict_decoder(nullptr); |
304 | DictionaryDecoder<BooleanType> decoder(nullptr); |
305 | |
306 | ASSERT_THROW(decoder.SetDict(&dict_decoder), ParquetException); |
307 | } |
308 | |
309 | } // namespace test |
310 | |
311 | } // namespace parquet |
312 | |