1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #pragma once |
19 | |
20 | #include <algorithm> |
21 | #include <cstdint> |
22 | #include <cstdlib> |
23 | #include <cstring> |
24 | #include <iostream> |
25 | #include <limits> |
26 | #include <memory> |
27 | #include <random> |
28 | #include <sstream> |
29 | #include <string> |
30 | #include <type_traits> |
31 | #include <vector> |
32 | |
33 | #include <gtest/gtest.h> |
34 | |
35 | #include "arrow/array.h" |
36 | #include "arrow/buffer.h" |
37 | #include "arrow/builder.h" |
38 | #include "arrow/memory_pool.h" |
39 | #include "arrow/pretty_print.h" |
40 | #include "arrow/record_batch.h" |
41 | #include "arrow/status.h" |
42 | #include "arrow/type.h" |
43 | #include "arrow/type_traits.h" |
44 | #include "arrow/util/bit-util.h" |
45 | #include "arrow/util/logging.h" |
46 | #include "arrow/util/macros.h" |
47 | #include "arrow/util/visibility.h" |
48 | |
// Asserts that `expr` evaluates to an arrow::Status whose Is<ENUM>() predicate
// is true (e.g. ASSERT_RAISES(Invalid, ...) checks IsInvalid()). Otherwise the
// current test fails fatally and the message shows the stringified expression,
// the expected status kind and the status actually obtained.
//
// `expr` is evaluated exactly once. The temporary is named `_s` (as in
// ASSERT_OK) rather than `s`, so that a caller-side variable called `s` can be
// passed without the declaration shadowing it and initializing itself.
#define ASSERT_RAISES(ENUM, expr)                                   \
  do {                                                              \
    ::arrow::Status _s = (expr);                                    \
    if (!_s.Is##ENUM()) {                                           \
      FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " \
                ARROW_STRINGIFY(ENUM) ", but got "                  \
             << _s.ToString();                                      \
    }                                                               \
  } while (false)
58 | |
// Like a status-kind assertion followed by a message check: fails fatally
// unless `expr` evaluates to an arrow::Status for which Is<ENUM>() is true,
// and then additionally requires ToString() of that status to equal `message`.
//
// `expr` is evaluated exactly once. The temporary is named `_s` rather than
// `s`, so that a caller-side variable called `s` can appear in `expr` or
// `message` without being shadowed by the macro's own local.
#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr)             \
  do {                                                              \
    ::arrow::Status _s = (expr);                                    \
    if (!_s.Is##ENUM()) {                                           \
      FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " \
                ARROW_STRINGIFY(ENUM) ", but got "                  \
             << _s.ToString();                                      \
    }                                                               \
    ASSERT_EQ((message), _s.ToString());                            \
  } while (false)
69 | |
// Evaluates `expr` once into an arrow::Status and fails the current test
// fatally, reporting the stringified expression and the status text, unless
// the status is ok().
#define ASSERT_OK(expr)                                                    \
  do {                                                                     \
    ::arrow::Status _s = (expr);                                           \
    if (_s.ok()) break;                                                    \
    FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _s.ToString(); \
  } while (false)
77 | |
// ASSERT_OK wrapped in gtest's ASSERT_NO_THROW: in addition to requiring an
// ok() status, an exception escaping the evaluation of `expr` is reported as
// a test failure.
78 | #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) |
79 | |
// Non-fatal counterpart of a status assertion: evaluates `expr` once into an
// arrow::Status and records a test failure (without aborting the test) if the
// status is not ok(). The failure message includes the stringified expression
// and the status text so the cause is visible in the test log.
//
// The temporary is named `_s` rather than `s` so that a caller-side variable
// called `s` can be passed without being shadowed by the macro's own local.
#define EXPECT_OK(expr)                                                \
  do {                                                                 \
    ::arrow::Status _s = (expr);                                       \
    EXPECT_TRUE(_s.ok()) << "'" ARROW_STRINGIFY(expr) "' failed with " \
                         << _s.ToString();                             \
  } while (false)
85 | |
// Evaluates `s` once into an arrow::Status; if it is not ok(), prints the
// status text to std::cerr and terminates the process with std::abort().
//
// The error text is taken from the stored copy `_s`, not from the macro
// argument, so an argument with side effects (e.g. a function call) is not
// evaluated a second time on the failure path.
//
// NOTE: the definition ends with a semicolon, so `ABORT_NOT_OK(x);` expands to
// an extra empty statement. It is kept because existing call sites may rely on
// it (by omitting their own semicolon); avoid using the macro as the sole body
// of an `if` that has an `else`.
#define ABORT_NOT_OK(s)                   \
  do {                                    \
    ::arrow::Status _s = (s);             \
    if (ARROW_PREDICT_FALSE(!_s.ok())) {  \
      std::cerr << _s.ToString() << "\n"; \
      std::abort();                       \
    }                                     \
  } while (false);
94 | |
95 | namespace arrow { |
96 | |
// Forward declarations of types that the helper declarations in this header
// refer to.
// NOTE(review): the ChunkedArrayFromVector templates call
// std::make_shared<ChunkedArray>, so code instantiating them presumably needs
// the full ChunkedArray definition from another header -- confirm.
97 | class ChunkedArray; |
98 | class Column; |
99 | class Table; |
100 | |
// Convenience alias: a list of shared pointers to Array.
101 | using ArrayVector = std::vector<std::shared_ptr<Array>>; |
102 | |
// Fails the current test fatally unless (LEFT).Equals(RIGHT) is true. On
// failure both operands are pretty-printed: RIGHT is shown under "Got" and
// LEFT under "Expected", i.e. LEFT is treated as the expected value.
// A PrettyPrint error is only recorded through EXPECT_OK (non-fatal).
// LEFT and RIGHT are expanded more than once, so they should be free of side
// effects.
103 | #define ASSERT_ARRAYS_EQUAL(LEFT, RIGHT) \ |
104 | do { \ |
105 | if (!(LEFT).Equals((RIGHT))) { \ |
106 | std::stringstream pp_result; \ |
107 | std::stringstream pp_expected; \ |
108 | \ |
109 | EXPECT_OK(PrettyPrint(RIGHT, 0, &pp_result)); \ |
110 | EXPECT_OK(PrettyPrint(LEFT, 0, &pp_expected)); \ |
111 | FAIL() << "Got: \n" << pp_result.str() << "\nExpected: \n" << pp_expected.str(); \ |
112 | } \ |
113 | } while (false) |
114 | |
// Fills *out with N pseudo-random integers drawn uniformly from the closed
// range [lower, upper], each converted to U.
//
// The engine is always seeded with 0, so repeated calls with the same
// arguments produce the same sequence. Any previous contents of *out are
// overwritten and the vector is resized to exactly N elements.
//
// T is the type of the bounds, U the element type of the output vector.
template <typename T, typename U>
void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
  // std::uniform_int_distribution is only specified for short, int, long,
  // long long and their unsigned variants. For one-byte T (int8_t, uint8_t,
  // char, bool) draw from a 16-bit distribution over the same bounds instead;
  // every result still lies in [lower, upper].
  using DistType = typename std::conditional<
      sizeof(T) == 1,
      typename std::conditional<std::is_signed<T>::value, int16_t, uint16_t>::type,
      T>::type;

  const int random_seed = 0;
  std::default_random_engine gen(random_seed);
  std::uniform_int_distribution<DistType> d(lower, upper);
  // The fill value is a placeholder (every element is overwritten below); it
  // is spelled in the element type U to avoid an implicit T -> U conversion.
  out->resize(static_cast<typename std::vector<U>::size_type>(N), static_cast<U>(0));
  std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast<U>(d(gen)); });
}
123 | |
// Fills *out with n pseudo-random real numbers generated by a
// std::uniform_real_distribution<T> over (min_value, max_value), each
// converted to U. The engine is seeded with `seed`, so equal arguments give
// equal sequences. *out is resized to n and every element is overwritten.
template <typename T, typename U>
void random_real(int64_t n, uint32_t seed, T min_value, T max_value,
                 std::vector<U>* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<T> dist(min_value, max_value);

  out->resize(n, static_cast<T>(0));
  // Draw values front to back so the sequence order is well defined.
  for (auto&& slot : *out) {
    slot = static_cast<U>(dist(engine));
  }
}
132 | |
133 | template <typename T> |
134 | inline Status CopyBufferFromVector(const std::vector<T>& values, MemoryPool* pool, |
135 | std::shared_ptr<Buffer>* result) { |
136 | int64_t nbytes = static_cast<int>(values.size()) * sizeof(T); |
137 | |
138 | std::shared_ptr<Buffer> buffer; |
139 | RETURN_NOT_OK(AllocateBuffer(pool, nbytes, &buffer)); |
140 | auto immutable_data = reinterpret_cast<const uint8_t*>(values.data()); |
141 | std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data()); |
142 | memset(buffer->mutable_data() + nbytes, 0, |
143 | static_cast<size_t>(buffer->capacity() - nbytes)); |
144 | |
145 | *result = buffer; |
146 | return Status::OK(); |
147 | } |
148 | |
149 | template <typename T> |
150 | static inline Status GetBitmapFromVector(const std::vector<T>& is_valid, |
151 | std::shared_ptr<Buffer>* result) { |
152 | size_t length = is_valid.size(); |
153 | |
154 | std::shared_ptr<Buffer> buffer; |
155 | RETURN_NOT_OK(AllocateEmptyBitmap(length, &buffer)); |
156 | |
157 | uint8_t* bitmap = buffer->mutable_data(); |
158 | for (size_t i = 0; i < static_cast<size_t>(length); ++i) { |
159 | if (is_valid[i]) { |
160 | BitUtil::SetBit(bitmap, i); |
161 | } |
162 | } |
163 | |
164 | *result = buffer; |
165 | return Status::OK(); |
166 | } |
167 | |
// Test-assertion wrapper around GetBitmapFromVector: builds the bitmap for
// `is_valid` into *out and checks the returned Status with ASSERT_OK, so a
// failure is reported as a fatal gtest failure of the calling test body.
168 | template <typename T> |
169 | inline void BitmapFromVector(const std::vector<T>& is_valid, |
170 | std::shared_ptr<Buffer>* out) { |
171 | ASSERT_OK(GetBitmapFromVector(is_valid, out)); |
172 | } |
173 | |
// NOTE(review): the helpers in this group are only declared here; their
// definitions are not visible in this header. Descriptions marked
// "presumably" are inferred from names and signatures -- confirm against the
// implementation.
174 | // Sets approximately pct_null of the first n bytes in null_bytes to zero |
175 | // and the rest to non-zero (true) values. |
176 | ARROW_EXPORT void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes); |
// Presumably the std::vector<bool> counterpart of random_null_bytes.
177 | ARROW_EXPORT void random_is_valid(int64_t n, double pct_null, |
178 | std::vector<bool>* is_valid); |
// Presumably writes n pseudo-random bytes to out, seeded with `seed`.
179 | ARROW_EXPORT void random_bytes(int64_t n, uint32_t seed, uint8_t* out); |
// Presumably the byte width used to store a decimal of the given precision.
180 | ARROW_EXPORT int32_t DecimalSize(int32_t precision); |
// Presumably writes n pseudo-random decimals of the given precision to out.
181 | ARROW_EXPORT void random_decimals(int64_t n, uint32_t seed, int32_t precision, |
182 | uint8_t* out); |
// Presumably writes n pseudo-random ASCII characters to out.
183 | ARROW_EXPORT void random_ascii(int64_t n, uint32_t seed, uint8_t* out); |
// Presumably counts the zero ("null") entries in valid_bytes.
184 | ARROW_EXPORT int64_t CountNulls(const std::vector<uint8_t>& valid_bytes); |
185 | |
// Presumably allocates a resizable buffer of `length` bytes from `pool` and
// fills it with pseudo-random bytes; `seed` defaults to 0.
186 | ARROW_EXPORT Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool, |
187 | std::shared_ptr<ResizableBuffer>* out, |
188 | uint32_t seed = 0); |
189 | |
// Equality assertions for test code. Only the declarations are visible here;
// presumably each reports a gtest failure when its arguments differ -- confirm
// against the definitions.
190 | ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual); |
191 | ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected, |
192 | const ChunkedArray& actual); |
// NOTE: this overload takes (actual, expected), the reverse of the
// (expected, actual) order used by the overload above.
193 | ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& actual, |
194 | const ArrayVector& expected); |
// Buffer comparisons against a byte vector, a string, or another Buffer.
195 | ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, |
196 | const std::vector<uint8_t>& expected); |
197 | ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const std::string& expected); |
198 | ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const Buffer& expected); |
199 | ARROW_EXPORT void AssertSchemaEqual(const Schema& lhs, const Schema& rhs); |
200 | |
// Presumably writes a printable form of `col` into *ss.
201 | ARROW_EXPORT void PrintColumn(const Column& col, std::stringstream* ss); |
// same_chunk_layout defaults to true; presumably, when false, tables whose
// columns are chunked differently may still compare equal -- confirm.
202 | ARROW_EXPORT void AssertTablesEqual(const Table& expected, const Table& actual, |
203 | bool same_chunk_layout = true); |
204 | |
// Asserts that the values starting at raw_data match expected_values element
// by element, in order. Exactly expected_values.size() elements are read from
// raw_data; the first mismatch is a fatal gtest failure and stops the check.
template <typename C_TYPE>
void AssertNumericDataEqual(const C_TYPE* raw_data,
                            const std::vector<C_TYPE>& expected_values) {
  const auto last = expected_values.end();
  for (auto it = expected_values.begin(); it != last; ++it, ++raw_data) {
    auto expected = *it;
    ASSERT_EQ(expected, *raw_data);
  }
}
213 | |
// Presumably asserts that two record batches are equal (definition not
// visible in this header -- confirm).
214 | ARROW_EXPORT void CompareBatch(const RecordBatch& left, const RecordBatch& right); |
215 | |
216 | // Check if the padding of the buffers of the array is zero. |
217 | // Also cause valgrind warnings if the padding bytes are uninitialized. |
218 | ARROW_EXPORT void AssertZeroPadded(const Array& array); |
219 | |
220 | // Check if the valid buffer bytes are initialized |
221 | // and cause valgrind warnings otherwise. |
222 | ARROW_EXPORT void TestInitialized(const Array& array); |
223 | |
// Finishes `builder` into *out and then runs the two buffer sanity checks on
// the resulting array: AssertZeroPadded and TestInitialized.
// If Finish() fails, ASSERT_OK reports a fatal failure and the checks are
// skipped.
224 | template <typename BuilderType> |
225 | void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr<Array>* out) { |
226 | ASSERT_OK(builder->Finish(out)); |
227 | AssertZeroPadded(**out); |
228 | TestInitialized(**out); |
229 | } |
230 | |
// Writes n pseudo-random integers, drawn uniformly from the closed range
// [min_value, max_value] and converted to U, to out[0 .. n-1].
//
// The engine is seeded with `seed`, so equal arguments give equal sequences.
// `out` may be null only when n == 0 (checked with DCHECK).
template <typename T, typename U>
void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) {
  // std::uniform_int_distribution is only specified for short, int, long,
  // long long and their unsigned variants. For one-byte T (int8_t, uint8_t,
  // char, bool) draw from a 16-bit distribution over the same bounds instead;
  // every result still lies in [min_value, max_value].
  using DistType = typename std::conditional<
      sizeof(T) == 1,
      typename std::conditional<std::is_signed<T>::value, int16_t, uint16_t>::type,
      T>::type;

  DCHECK(out || (n == 0));
  std::default_random_engine gen(seed);
  std::uniform_int_distribution<DistType> d(min_value, max_value);
  std::generate(out, out + n, [&d, &gen] { return static_cast<U>(d(gen)); });
}
238 | |
// Type-dispatched random data generator. The primary template has no Gen()
// member, so GenerateRandom<T>::Gen only compiles for types that have a
// specialization.
239 | template <typename T, typename Enable = void> |
240 | struct GenerateRandom {}; |
241 | |
// Specialization selected for integral T: fills `length` values of type T at
// `out` (reinterpreted as T*) using rand_uniform_int over the full range
// [numeric_limits<T>::min(), numeric_limits<T>::max()], seeded with `seed`.
242 | template <typename T> |
243 | struct GenerateRandom<T, typename std::enable_if<std::is_integral<T>::value>::type> { |
244 | static void Gen(int64_t length, uint32_t seed, void* out) { |
245 | rand_uniform_int(length, seed, std::numeric_limits<T>::min(), |
246 | std::numeric_limits<T>::max(), reinterpret_cast<T*>(out)); |
247 | } |
248 | }; |
249 | |
// Allocates a resizable buffer of sizeof(T) * length bytes from `pool` and
// fills it with `length` random values of type T via GenerateRandom<T>::Gen,
// seeded with `seed` (default 0). On success the buffer is stored in *out and
// Status::OK() is returned; an allocation failure is returned through
// RETURN_NOT_OK before *out is written. `pool` must not be null (DCHECK).
250 | template <typename T> |
251 | Status MakeRandomBuffer(int64_t length, MemoryPool* pool, |
252 | std::shared_ptr<ResizableBuffer>* out, uint32_t seed = 0) { |
253 | DCHECK(pool); |
254 | std::shared_ptr<ResizableBuffer> result; |
255 | RETURN_NOT_OK(AllocateResizableBuffer(pool, sizeof(T) * length, &result)); |
256 | GenerateRandom<T>::Gen(length, seed, result->mutable_data()); |
257 | *out = result; |
258 | return Status::OK(); |
259 | } |
260 | |
261 | // ArrayFromJSON: construct an Array from a simple JSON representation |
// The first (unnamed) parameter is a DataType, presumably the type of the
// array to build; `json` is the textual representation. The definition is not
// visible in this header, so error behavior cannot be stated here -- confirm
// against the implementation.
262 | |
263 | ARROW_EXPORT |
264 | std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>&, |
265 | const std::string& json); |
266 | |
267 | // ArrayFromVector: construct an Array from vectors of C values |
268 | |
269 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
270 | void ArrayFromVector(const std::shared_ptr<DataType>& type, |
271 | const std::vector<bool>& is_valid, const std::vector<C_TYPE>& values, |
272 | std::shared_ptr<Array>* out) { |
273 | DCHECK_EQ(TYPE::type_id, type->id()) |
274 | << "template parameter and concrete DataType instance don't agree" ; |
275 | |
276 | std::unique_ptr<ArrayBuilder> builder_ptr; |
277 | ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr)); |
278 | // Get the concrete builder class to access its Append() specializations |
279 | auto& builder = dynamic_cast<typename TypeTraits<TYPE>::BuilderType&>(*builder_ptr); |
280 | |
281 | for (size_t i = 0; i < values.size(); ++i) { |
282 | if (is_valid[i]) { |
283 | ASSERT_OK(builder.Append(values[i])); |
284 | } else { |
285 | ASSERT_OK(builder.AppendNull()); |
286 | } |
287 | } |
288 | ASSERT_OK(builder.Finish(out)); |
289 | } |
290 | |
// Builds an Array of the given `type` from `values`, appending every element
// as a non-null value, and stores the result in *out.
// TYPE is the Arrow type class (its type_id must match type->id(), checked
// with DCHECK_EQ); C_TYPE is the element type of `values`.
// Any builder error is reported with ASSERT_OK, i.e. as a fatal gtest failure
// that returns from this function early.
291 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
292 | void ArrayFromVector(const std::shared_ptr<DataType>& type, |
293 | const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) { |
294 | DCHECK_EQ(TYPE::type_id, type->id()) |
295 | << "template parameter and concrete DataType instance don't agree" ; |
296 | |
297 | std::unique_ptr<ArrayBuilder> builder_ptr; |
298 | ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr)); |
299 | // Get the concrete builder class to access its Append() specializations |
// dynamic_cast on a reference throws std::bad_cast if the builder created for
// `type` is not of the BuilderType associated with TYPE.
300 | auto& builder = dynamic_cast<typename TypeTraits<TYPE>::BuilderType&>(*builder_ptr); |
301 | |
302 | for (size_t i = 0; i < values.size(); ++i) { |
303 | ASSERT_OK(builder.Append(values[i])); |
304 | } |
305 | ASSERT_OK(builder.Finish(out)); |
306 | } |
307 | |
308 | // Overloads without a DataType argument, for parameterless types |
309 | |
310 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
311 | void ArrayFromVector(const std::vector<bool>& is_valid, const std::vector<C_TYPE>& values, |
312 | std::shared_ptr<Array>* out) { |
313 | auto type = TypeTraits<TYPE>::type_singleton(); |
314 | ArrayFromVector<TYPE, C_TYPE>(type, is_valid, values, out); |
315 | } |
316 | |
317 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
318 | void ArrayFromVector(const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) { |
319 | auto type = TypeTraits<TYPE>::type_singleton(); |
320 | ArrayFromVector<TYPE, C_TYPE>(type, values, out); |
321 | } |
322 | |
323 | // ChunkedArrayFromVector: construct a ChunkedArray from vectors of C values |
324 | |
325 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
326 | void ChunkedArrayFromVector(const std::shared_ptr<DataType>& type, |
327 | const std::vector<std::vector<bool>>& is_valid, |
328 | const std::vector<std::vector<C_TYPE>>& values, |
329 | std::shared_ptr<ChunkedArray>* out) { |
330 | ArrayVector chunks; |
331 | DCHECK_EQ(is_valid.size(), values.size()); |
332 | for (size_t i = 0; i < values.size(); ++i) { |
333 | std::shared_ptr<Array> array; |
334 | ArrayFromVector<TYPE, C_TYPE>(type, is_valid[i], values[i], &array); |
335 | chunks.push_back(array); |
336 | } |
337 | *out = std::make_shared<ChunkedArray>(chunks); |
338 | } |
339 | |
340 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
341 | void ChunkedArrayFromVector(const std::shared_ptr<DataType>& type, |
342 | const std::vector<std::vector<C_TYPE>>& values, |
343 | std::shared_ptr<ChunkedArray>* out) { |
344 | ArrayVector chunks; |
345 | for (size_t i = 0; i < values.size(); ++i) { |
346 | std::shared_ptr<Array> array; |
347 | ArrayFromVector<TYPE, C_TYPE>(type, values[i], &array); |
348 | chunks.push_back(array); |
349 | } |
350 | *out = std::make_shared<ChunkedArray>(chunks); |
351 | } |
352 | |
353 | // Overloads without a DataType argument, for parameterless types |
354 | |
355 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
356 | void ChunkedArrayFromVector(const std::vector<std::vector<bool>>& is_valid, |
357 | const std::vector<std::vector<C_TYPE>>& values, |
358 | std::shared_ptr<ChunkedArray>* out) { |
359 | auto type = TypeTraits<TYPE>::type_singleton(); |
360 | ChunkedArrayFromVector<TYPE, C_TYPE>(type, is_valid, values, out); |
361 | } |
362 | |
363 | template <typename TYPE, typename C_TYPE = typename TYPE::c_type> |
364 | void ChunkedArrayFromVector(const std::vector<std::vector<C_TYPE>>& values, |
365 | std::shared_ptr<ChunkedArray>* out) { |
366 | auto type = TypeTraits<TYPE>::type_singleton(); |
367 | ChunkedArrayFromVector<TYPE, C_TYPE>(type, values, out); |
368 | } |
369 | |
370 | template <class T, class Builder> |
371 | Status MakeArray(const std::vector<uint8_t>& valid_bytes, const std::vector<T>& values, |
372 | int64_t size, Builder* builder, std::shared_ptr<Array>* out) { |
373 | // Append the first 1000 |
374 | for (int64_t i = 0; i < size; ++i) { |
375 | if (valid_bytes[i] > 0) { |
376 | RETURN_NOT_OK(builder->Append(values[i])); |
377 | } else { |
378 | RETURN_NOT_OK(builder->AppendNull()); |
379 | } |
380 | } |
381 | return builder->Finish(out); |
382 | } |
383 | |
// Introduces `T` as a local alias for TestFixture::T.
// NOTE(review): presumably meant for the body of a gtest typed test, where
// TestFixture names the fixture class -- confirm. The trailing semicolon is
// part of the macro.
384 | #define DECL_T() typedef typename TestFixture::T T; |
385 | |
// Introduces `Type` as a local alias for TestFixture::Type (same usage and
// trailing-semicolon note as DECL_T).
386 | #define DECL_TYPE() typedef typename TestFixture::Type Type; |
387 | |
// Fails the current test fatally unless (LEFT).ApproxEquals(RIGHT) is true.
// On failure both operands are pretty-printed into one message, under
// "Left:" and "Right:". A PrettyPrint error is itself a fatal failure
// (ASSERT_OK). LEFT and RIGHT are expanded more than once, so they should be
// free of side effects.
388 | #define ASSERT_BATCHES_EQUAL(LEFT, RIGHT) \ |
389 | do { \ |
390 | if (!(LEFT).ApproxEquals(RIGHT)) { \ |
391 | std::stringstream ss; \ |
392 | ss << "Left:\n"; \ |
393 | ASSERT_OK(PrettyPrint(LEFT, 0, &ss)); \ |
394 | \ |
395 | ss << "\nRight:\n"; \ |
396 | ASSERT_OK(PrettyPrint(RIGHT, 0, &ss)); \ |
397 | FAIL() << ss.str(); \ |
398 | } \ |
399 | } while (false) |
400 | |
401 | // ---------------------------------------------------------------------- |
402 | // A RecordBatchReader for serving a sequence of in-memory record batches |
403 | |
404 | class BatchIterator : public RecordBatchReader { |
405 | public: |
406 | BatchIterator(const std::shared_ptr<Schema>& schema, |
407 | const std::vector<std::shared_ptr<RecordBatch>>& batches) |
408 | : schema_(schema), batches_(batches), position_(0) {} |
409 | |
410 | std::shared_ptr<Schema> schema() const override { return schema_; } |
411 | |
412 | Status ReadNext(std::shared_ptr<RecordBatch>* out) override { |
413 | if (position_ >= batches_.size()) { |
414 | *out = nullptr; |
415 | } else { |
416 | *out = batches_[position_++]; |
417 | } |
418 | return Status::OK(); |
419 | } |
420 | |
421 | private: |
422 | std::shared_ptr<Schema> schema_; |
423 | std::vector<std::shared_ptr<RecordBatch>> batches_; |
424 | size_t position_; |
425 | }; |
426 | |
427 | } // namespace arrow |
428 | |