1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/test-util.h" |
19 | |
20 | #ifndef _WIN32 |
21 | #include <sys/stat.h> // IWYU pragma: keep |
22 | #include <sys/wait.h> // IWYU pragma: keep |
23 | #include <unistd.h> // IWYU pragma: keep |
24 | #endif |
25 | |
26 | #include <algorithm> |
27 | #include <cstdint> |
28 | #include <cstdlib> |
29 | #include <iostream> |
30 | #include <limits> |
31 | #include <memory> |
32 | #include <random> |
33 | #include <sstream> |
34 | #include <string> |
35 | #include <vector> |
36 | |
37 | #include <gtest/gtest.h> |
38 | |
39 | #include "arrow/array.h" |
40 | #include "arrow/buffer.h" |
41 | #include "arrow/ipc/json-simple.h" |
42 | #include "arrow/pretty_print.h" |
43 | #include "arrow/status.h" |
44 | #include "arrow/table.h" |
45 | #include "arrow/type.h" |
46 | #include "arrow/util/logging.h" |
47 | |
48 | namespace arrow { |
49 | |
// Test convenience wrapper: parses `json` into an array of `type` by delegating to
// ipc::internal::json::ArrayFromJSON and hands the resulting array back by value
// instead of through an out-parameter.
//
// The Status of the underlying call is passed to ABORT_NOT_OK (defined elsewhere;
// presumably it terminates the process on a non-OK status -- confirm against its
// definition), so no error is reported to the caller.
std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>& type,
                                     const std::string& json) {
  std::shared_ptr<Array> out;
  ABORT_NOT_OK(ipc::internal::json::ArrayFromJSON(type, json, &out));
  return out;
}
56 | |
// Writes `n` validity flags (one byte each, 0 or 1) to `null_bytes`.
//
// Each flag is 1 when a uniform draw from [0, 1) is strictly greater than
// `pct_null`, and 0 otherwise. The engine is seeded with a fixed value, so the
// produced sequence is the same on every call.
void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
  constexpr int kFixedSeed = 0;
  std::default_random_engine engine(kFixedSeed);
  std::uniform_real_distribution<double> unit_interval(0.0, 1.0);

  for (int64_t i = 0; i < n; ++i) {
    const bool is_valid = unit_interval(engine) > pct_null;
    null_bytes[i] = is_valid;
  }
}
64 | |
// Resizes `is_valid` to `n` entries and overwrites every entry with a random flag.
//
// An entry is true when a uniform draw from [0, 1) is strictly greater than
// `pct_null`. The engine is seeded with a fixed value, so the produced sequence
// is the same on every call.
void random_is_valid(int64_t n, double pct_null, std::vector<bool>* is_valid) {
  constexpr int kFixedSeed = 0;
  std::default_random_engine engine(kFixedSeed);
  std::uniform_real_distribution<double> unit_interval(0.0, 1.0);

  is_valid->resize(n, false);
  std::vector<bool>& flags = *is_valid;
  for (size_t i = 0; i < flags.size(); ++i) {
    flags[i] = unit_interval(engine) > pct_null;
  }
}
73 | |
// Fills out[0 .. n) with pseudo-random bytes.
//
// The generator is seeded with `seed`, so equal seeds give equal output. Values
// are drawn as uint32_t in [0, 255] and narrowed to a byte, because
// std::uniform_int_distribution is not specified for 8-bit integer types.
void random_bytes(int64_t n, uint32_t seed, uint8_t* out) {
  std::default_random_engine engine(seed);
  const uint32_t max_byte = std::numeric_limits<uint8_t>::max();
  std::uniform_int_distribution<uint32_t> byte_dist(0, max_byte);

  for (int64_t i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(byte_dist(engine));
  }
}
79 | |
80 | int32_t DecimalSize(int32_t precision) { |
81 | DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got " |
82 | << precision; |
83 | DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got " |
84 | << precision; |
85 | |
86 | switch (precision) { |
87 | case 1: |
88 | case 2: |
89 | return 1; // 127 |
90 | case 3: |
91 | case 4: |
92 | return 2; // 32,767 |
93 | case 5: |
94 | case 6: |
95 | return 3; // 8,388,607 |
96 | case 7: |
97 | case 8: |
98 | case 9: |
99 | return 4; // 2,147,483,427 |
100 | case 10: |
101 | case 11: |
102 | return 5; // 549,755,813,887 |
103 | case 12: |
104 | case 13: |
105 | case 14: |
106 | return 6; // 140,737,488,355,327 |
107 | case 15: |
108 | case 16: |
109 | return 7; // 36,028,797,018,963,967 |
110 | case 17: |
111 | case 18: |
112 | return 8; // 9,223,372,036,854,775,807 |
113 | case 19: |
114 | case 20: |
115 | case 21: |
116 | return 9; // 2,361,183,241,434,822,606,847 |
117 | case 22: |
118 | case 23: |
119 | return 10; // 604,462,909,807,314,587,353,087 |
120 | case 24: |
121 | case 25: |
122 | case 26: |
123 | return 11; // 154,742,504,910,672,534,362,390,527 |
124 | case 27: |
125 | case 28: |
126 | return 12; // 39,614,081,257,132,168,796,771,975,167 |
127 | case 29: |
128 | case 30: |
129 | case 31: |
130 | return 13; // 10,141,204,801,825,835,211,973,625,643,007 |
131 | case 32: |
132 | case 33: |
133 | return 14; // 2,596,148,429,267,413,814,265,248,164,610,047 |
134 | case 34: |
135 | case 35: |
136 | return 15; // 664,613,997,892,457,936,451,903,530,140,172,287 |
137 | case 36: |
138 | case 37: |
139 | case 38: |
140 | return 16; // 170,141,183,460,469,231,731,687,303,715,884,105,727 |
141 | default: |
142 | DCHECK(false); |
143 | break; |
144 | } |
145 | return -1; |
146 | } |
147 | |
148 | void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) { |
149 | std::default_random_engine gen(seed); |
150 | std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max()); |
151 | const int32_t required_bytes = DecimalSize(precision); |
152 | constexpr int32_t byte_width = 16; |
153 | std::fill(out, out + byte_width * n, '\0'); |
154 | |
155 | for (int64_t i = 0; i < n; ++i, out += byte_width) { |
156 | std::generate(out, out + required_bytes, |
157 | [&d, &gen] { return static_cast<uint8_t>(d(gen)); }); |
158 | |
159 | // sign extend if the sign bit is set for the last byte generated |
160 | // 0b10000000 == 0x80 == 128 |
161 | if ((out[required_bytes - 1] & '\x80') != 0) { |
162 | std::fill(out + required_bytes, out + byte_width, '\xFF'); |
163 | } |
164 | } |
165 | } |
166 | |
// Fills `out` by delegating to rand_uniform_int (declared elsewhere) with `n`,
// `seed` and the bounds 'A' and 'z'.
// NOTE(review): in ASCII the span from 'A' (65) to 'z' (122) also contains the
// punctuation characters between 'Z' and 'a', so the output is presumably not
// restricted to letters -- confirm against rand_uniform_int's bound semantics.
void random_ascii(int64_t n, uint32_t seed, uint8_t* out) {
  rand_uniform_int(n, seed, static_cast<int32_t>('A'), static_cast<int32_t>('z'), out);
}
170 | |
// Returns how many entries of `valid_bytes` are zero. Given one validity byte per
// slot, this is the number of null slots.
int64_t CountNulls(const std::vector<uint8_t>& valid_bytes) {
  int64_t null_count = 0;
  for (const uint8_t flag : valid_bytes) {
    if (flag == 0) {
      ++null_count;
    }
  }
  return null_count;
}
174 | |
175 | Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool, |
176 | std::shared_ptr<ResizableBuffer>* out, uint32_t seed) { |
177 | std::shared_ptr<ResizableBuffer> result; |
178 | RETURN_NOT_OK(AllocateResizableBuffer(pool, length, &result)); |
179 | random_bytes(length, seed, result->mutable_data()); |
180 | *out = result; |
181 | return Status::OK(); |
182 | } |
183 | |
// Function form of the ASSERT_ARRAYS_EQUAL macro (defined elsewhere): forwards
// `expected` and `actual`, in that order, to the macro.
void AssertArraysEqual(const Array& expected, const Array& actual) {
  ASSERT_ARRAYS_EQUAL(expected, actual);
}
187 | |
188 | void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual) { |
189 | ASSERT_EQ(expected.num_chunks(), actual.num_chunks()) << "# chunks unequal" ; |
190 | if (!actual.Equals(expected)) { |
191 | std::stringstream pp_result; |
192 | std::stringstream pp_expected; |
193 | |
194 | for (int i = 0; i < actual.num_chunks(); ++i) { |
195 | auto c1 = actual.chunk(i); |
196 | auto c2 = expected.chunk(i); |
197 | if (!c1->Equals(*c2)) { |
198 | EXPECT_OK(::arrow::PrettyPrint(*c1, 0, &pp_result)); |
199 | EXPECT_OK(::arrow::PrettyPrint(*c2, 0, &pp_expected)); |
200 | FAIL() << "Chunk " << i << " Got: " << pp_result.str() |
201 | << "\nExpected: " << pp_expected.str(); |
202 | } |
203 | } |
204 | } |
205 | } |
206 | |
207 | void AssertChunkedEqual(const ChunkedArray& actual, const ArrayVector& expected) { |
208 | AssertChunkedEqual(ChunkedArray(expected, actual.type()), actual); |
209 | } |
210 | |
211 | void AssertBufferEqual(const Buffer& buffer, const std::vector<uint8_t>& expected) { |
212 | ASSERT_EQ(buffer.size(), expected.size()) << "Mismatching buffer size" ; |
213 | const uint8_t* buffer_data = buffer.data(); |
214 | for (size_t i = 0; i < expected.size(); ++i) { |
215 | ASSERT_EQ(buffer_data[i], expected[i]); |
216 | } |
217 | } |
218 | |
219 | void AssertBufferEqual(const Buffer& buffer, const std::string& expected) { |
220 | ASSERT_EQ(buffer.size(), expected.length()) << "Mismatching buffer size" ; |
221 | const uint8_t* buffer_data = buffer.data(); |
222 | for (size_t i = 0; i < expected.size(); ++i) { |
223 | ASSERT_EQ(buffer_data[i], expected[i]); |
224 | } |
225 | } |
226 | |
// Asserts that two buffers are equal.
//
// The sizes are compared first so that a length mismatch is reported with its own
// message; the content comparison is then delegated to Buffer::Equals. Both
// checks are fatal gtest assertions, i.e. a failure returns from this helper.
void AssertBufferEqual(const Buffer& buffer, const Buffer& expected) {
  ASSERT_EQ(buffer.size(), expected.size()) << "Mismatching buffer size";
  ASSERT_TRUE(buffer.Equals(expected));
}
231 | |
232 | void AssertSchemaEqual(const Schema& lhs, const Schema& rhs) { |
233 | if (!lhs.Equals(rhs)) { |
234 | std::stringstream ss; |
235 | ss << "left schema: " << lhs.ToString() << std::endl |
236 | << "right schema: " << rhs.ToString() << std::endl; |
237 | FAIL() << ss.str(); |
238 | } |
239 | } |
240 | |
241 | void PrintColumn(const Column& col, std::stringstream* ss) { |
242 | const ChunkedArray& carr = *col.data(); |
243 | for (int i = 0; i < carr.num_chunks(); ++i) { |
244 | auto c1 = carr.chunk(i); |
245 | *ss << "Chunk " << i << std::endl; |
246 | EXPECT_OK(::arrow::PrettyPrint(*c1, 0, ss)); |
247 | *ss << std::endl; |
248 | } |
249 | } |
250 | |
251 | void AssertTablesEqual(const Table& expected, const Table& actual, |
252 | bool same_chunk_layout) { |
253 | ASSERT_EQ(expected.num_columns(), actual.num_columns()); |
254 | |
255 | if (same_chunk_layout) { |
256 | for (int i = 0; i < actual.num_columns(); ++i) { |
257 | AssertChunkedEqual(*expected.column(i)->data(), *actual.column(i)->data()); |
258 | } |
259 | } else { |
260 | std::stringstream ss; |
261 | if (!actual.Equals(expected)) { |
262 | for (int i = 0; i < expected.num_columns(); ++i) { |
263 | ss << "Actual column " << i << std::endl; |
264 | PrintColumn(*actual.column(i), &ss); |
265 | |
266 | ss << "Expected column " << i << std::endl; |
267 | PrintColumn(*expected.column(i), &ss); |
268 | } |
269 | FAIL() << ss.str(); |
270 | } |
271 | } |
272 | } |
273 | |
274 | void CompareBatch(const RecordBatch& left, const RecordBatch& right) { |
275 | if (!left.schema()->Equals(*right.schema())) { |
276 | FAIL() << "Left schema: " << left.schema()->ToString() |
277 | << "\nRight schema: " << right.schema()->ToString(); |
278 | } |
279 | ASSERT_EQ(left.num_columns(), right.num_columns()) |
280 | << left.schema()->ToString() << " result: " << right.schema()->ToString(); |
281 | ASSERT_EQ(left.num_rows(), right.num_rows()); |
282 | for (int i = 0; i < left.num_columns(); ++i) { |
283 | if (!left.column(i)->Equals(right.column(i))) { |
284 | std::stringstream ss; |
285 | ss << "Idx: " << i << " Name: " << left.column_name(i); |
286 | ss << std::endl << "Left: " ; |
287 | ASSERT_OK(PrettyPrint(*left.column(i), 0, &ss)); |
288 | ss << std::endl << "Right: " ; |
289 | ASSERT_OK(PrettyPrint(*right.column(i), 0, &ss)); |
290 | FAIL() << ss.str(); |
291 | } |
292 | } |
293 | } |
294 | |
namespace {

// Sink for the value computed in TestInitialized(). It is declared volatile so
// that the store to it -- and therefore the buffer reads feeding that value --
// cannot be optimized away by the compiler even though the value is never used.
volatile int throw_away = 0;

}  // namespace
301 | |
302 | void AssertZeroPadded(const Array& array) { |
303 | for (const auto& buffer : array.data()->buffers) { |
304 | if (buffer) { |
305 | const int64_t padding = buffer->capacity() - buffer->size(); |
306 | if (padding > 0) { |
307 | std::vector<uint8_t> zeros(padding); |
308 | ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding)); |
309 | } |
310 | } |
311 | } |
312 | } |
313 | |
314 | void TestInitialized(const Array& array) { |
315 | for (const auto& buffer : array.data()->buffers) { |
316 | if (buffer && buffer->capacity() > 0) { |
317 | int total = 0; |
318 | auto data = buffer->data(); |
319 | for (int64_t i = 0; i < buffer->size(); ++i) { |
320 | total ^= data[i]; |
321 | } |
322 | throw_away = total; |
323 | } |
324 | } |
325 | } |
326 | |
327 | } // namespace arrow |
328 | |