1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/util/int-util.h" |
19 | |
20 | #include <algorithm> |
21 | #include <cstring> |
22 | #include <limits> |
23 | |
24 | #include "arrow/util/bit-util.h" |
25 | #include "arrow/util/logging.h" |
26 | |
27 | namespace arrow { |
28 | namespace internal { |
29 | |
// Maximum values representable by unsigned integers of 8, 16, 32 and 64 bits.
static constexpr uint64_t max_uint8 =
    static_cast<uint64_t>(std::numeric_limits<uint8_t>::max());
static constexpr uint64_t max_uint16 =
    static_cast<uint64_t>(std::numeric_limits<uint16_t>::max());
static constexpr uint64_t max_uint32 =
    static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
static constexpr uint64_t max_uint64 = std::numeric_limits<uint64_t>::max();

// Masks selecting the bits that do NOT fit in 8, 16 or 32 bits: a nonzero
// result when ANDed with one of these means the value needs a wider type.
static constexpr uint64_t mask_uint8 = ~0xffULL;
static constexpr uint64_t mask_uint16 = ~0xffffULL;
static constexpr uint64_t mask_uint32 = ~0xffffffffULL;

//
// Unsigned integer width detection
//

// Largest value representable for a given byte width, indexed by that width.
// Entries at indices 0, 3, 5, 6 and 7 are unused placeholders.
static const uint64_t max_uints[] = {0, max_uint8, max_uint16, 0, max_uint32,
                                     0, 0, 0, max_uint64};
48 | |
49 | // Check if we would need to expand the underlying storage type |
50 | inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) { |
51 | // Optimize for the common case where width doesn't change |
52 | if (ARROW_PREDICT_TRUE(val <= max_uints[current_width])) { |
53 | return current_width; |
54 | } |
55 | if (current_width == 1 && val <= max_uint8) { |
56 | return 1; |
57 | } else if (current_width <= 2 && val <= max_uint16) { |
58 | return 2; |
59 | } else if (current_width <= 4 && val <= max_uint32) { |
60 | return 4; |
61 | } else { |
62 | return 8; |
63 | } |
64 | } |
65 | |
66 | uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width) { |
67 | uint8_t width = min_width; |
68 | if (min_width < 8) { |
69 | auto p = values; |
70 | const auto end = p + length; |
71 | while (p <= end - 16) { |
72 | // This is probably SIMD-izable |
73 | auto u = p[0]; |
74 | auto v = p[1]; |
75 | auto w = p[2]; |
76 | auto x = p[3]; |
77 | u |= p[4]; |
78 | v |= p[5]; |
79 | w |= p[6]; |
80 | x |= p[7]; |
81 | u |= p[8]; |
82 | v |= p[9]; |
83 | w |= p[10]; |
84 | x |= p[11]; |
85 | u |= p[12]; |
86 | v |= p[13]; |
87 | w |= p[14]; |
88 | x |= p[15]; |
89 | p += 16; |
90 | width = ExpandedUIntWidth(u | v | w | x, width); |
91 | if (ARROW_PREDICT_FALSE(width == 8)) { |
92 | break; |
93 | } |
94 | } |
95 | if (p <= end - 8) { |
96 | auto u = p[0]; |
97 | auto v = p[1]; |
98 | auto w = p[2]; |
99 | auto x = p[3]; |
100 | u |= p[4]; |
101 | v |= p[5]; |
102 | w |= p[6]; |
103 | x |= p[7]; |
104 | p += 8; |
105 | width = ExpandedUIntWidth(u | v | w | x, width); |
106 | } |
107 | while (p < end) { |
108 | width = ExpandedUIntWidth(*p++, width); |
109 | } |
110 | } |
111 | return width; |
112 | } |
113 | |
114 | uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes, |
115 | int64_t length, uint8_t min_width) { |
116 | if (valid_bytes == nullptr) { |
117 | return DetectUIntWidth(values, length, min_width); |
118 | } |
119 | uint8_t width = min_width; |
120 | if (min_width < 8) { |
121 | auto p = values; |
122 | const auto end = p + length; |
123 | auto b = valid_bytes; |
124 | |
125 | #define MASK(p, b, i) p[i] * (b[i] != 0) |
126 | |
127 | while (p <= end - 8) { |
128 | // This is probably be SIMD-izable |
129 | auto u = MASK(p, b, 0); |
130 | auto v = MASK(p, b, 1); |
131 | auto w = MASK(p, b, 2); |
132 | auto x = MASK(p, b, 3); |
133 | u |= MASK(p, b, 4); |
134 | v |= MASK(p, b, 5); |
135 | w |= MASK(p, b, 6); |
136 | x |= MASK(p, b, 7); |
137 | b += 8; |
138 | p += 8; |
139 | width = ExpandedUIntWidth(u | v | w | x, width); |
140 | if (ARROW_PREDICT_FALSE(width == 8)) { |
141 | break; |
142 | } |
143 | } |
144 | uint64_t mask = 0; |
145 | while (p < end) { |
146 | mask |= MASK(p, b, 0); |
147 | ++b; |
148 | ++p; |
149 | } |
150 | width = ExpandedUIntWidth(mask, width); |
151 | |
152 | #undef MASK |
153 | } |
154 | return width; |
155 | } |
156 | |
157 | // |
158 | // Signed integer width detection |
159 | // |
160 | |
161 | uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width) { |
162 | if (min_width == 8) { |
163 | return min_width; |
164 | } |
165 | uint8_t width = min_width; |
166 | |
167 | auto p = values; |
168 | const auto end = p + length; |
169 | // Strategy: to determine whether `x` is between -0x80 and 0x7f, |
170 | // we determine whether `x + 0x80` is between 0x00 and 0xff. The |
171 | // latter can be done with a simple AND mask with ~0xff and, more |
172 | // importantly, can be computed in a single step over multiple ORed |
173 | // values (so we can branch once every N items instead of once every item). |
174 | // This strategy could probably lend itself to explicit SIMD-ization, |
175 | // if more performance is needed. |
176 | constexpr uint64_t addend8 = 0x80ULL; |
177 | constexpr uint64_t addend16 = 0x8000ULL; |
178 | constexpr uint64_t addend32 = 0x80000000ULL; |
179 | |
180 | auto test_one_item = [&](uint64_t addend, uint64_t test_mask) -> bool { |
181 | auto v = *p++; |
182 | if (ARROW_PREDICT_FALSE(((v + addend) & test_mask) != 0)) { |
183 | --p; |
184 | return false; |
185 | } else { |
186 | return true; |
187 | } |
188 | }; |
189 | |
190 | auto test_four_items = [&](uint64_t addend, uint64_t test_mask) -> bool { |
191 | auto mask = (p[0] + addend) | (p[1] + addend) | (p[2] + addend) | (p[3] + addend); |
192 | p += 4; |
193 | if (ARROW_PREDICT_FALSE((mask & test_mask) != 0)) { |
194 | p -= 4; |
195 | return false; |
196 | } else { |
197 | return true; |
198 | } |
199 | }; |
200 | |
201 | if (width == 1) { |
202 | while (p <= end - 4) { |
203 | if (!test_four_items(addend8, mask_uint8)) { |
204 | width = 2; |
205 | goto width2; |
206 | } |
207 | } |
208 | while (p < end) { |
209 | if (!test_one_item(addend8, mask_uint8)) { |
210 | width = 2; |
211 | goto width2; |
212 | } |
213 | } |
214 | return 1; |
215 | } |
216 | width2: |
217 | if (width == 2) { |
218 | while (p <= end - 4) { |
219 | if (!test_four_items(addend16, mask_uint16)) { |
220 | width = 4; |
221 | goto width4; |
222 | } |
223 | } |
224 | while (p < end) { |
225 | if (!test_one_item(addend16, mask_uint16)) { |
226 | width = 4; |
227 | goto width4; |
228 | } |
229 | } |
230 | return 2; |
231 | } |
232 | width4: |
233 | if (width == 4) { |
234 | while (p <= end - 4) { |
235 | if (!test_four_items(addend32, mask_uint32)) { |
236 | width = 8; |
237 | goto width8; |
238 | } |
239 | } |
240 | while (p < end) { |
241 | if (!test_one_item(addend32, mask_uint32)) { |
242 | width = 8; |
243 | goto width8; |
244 | } |
245 | } |
246 | return 4; |
247 | } |
248 | width8: |
249 | return 8; |
250 | } |
251 | |
252 | uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length, |
253 | uint8_t min_width) { |
254 | if (valid_bytes == nullptr) { |
255 | return DetectIntWidth(values, length, min_width); |
256 | } |
257 | |
258 | if (min_width == 8) { |
259 | return min_width; |
260 | } |
261 | uint8_t width = min_width; |
262 | |
263 | auto p = values; |
264 | const auto end = p + length; |
265 | auto b = valid_bytes; |
266 | // Strategy is similar to the no-nulls case above, but we also |
267 | // have to zero any incoming items that have a zero validity byte. |
268 | constexpr uint64_t addend8 = 0x80ULL; |
269 | constexpr uint64_t addend16 = 0x8000ULL; |
270 | constexpr uint64_t addend32 = 0x80000000ULL; |
271 | |
272 | #define MASK(p, b, addend, i) (p[i] + addend) * (b[i] != 0) |
273 | |
274 | auto test_one_item = [&](uint64_t addend, uint64_t test_mask) -> bool { |
275 | auto v = MASK(p, b, addend, 0); |
276 | ++b; |
277 | ++p; |
278 | if (ARROW_PREDICT_FALSE((v & test_mask) != 0)) { |
279 | --b; |
280 | --p; |
281 | return false; |
282 | } else { |
283 | return true; |
284 | } |
285 | }; |
286 | |
287 | auto test_eight_items = [&](uint64_t addend, uint64_t test_mask) -> bool { |
288 | auto mask1 = MASK(p, b, addend, 0) | MASK(p, b, addend, 1) | MASK(p, b, addend, 2) | |
289 | MASK(p, b, addend, 3); |
290 | auto mask2 = MASK(p, b, addend, 4) | MASK(p, b, addend, 5) | MASK(p, b, addend, 6) | |
291 | MASK(p, b, addend, 7); |
292 | b += 8; |
293 | p += 8; |
294 | if (ARROW_PREDICT_FALSE(((mask1 | mask2) & test_mask) != 0)) { |
295 | b -= 8; |
296 | p -= 8; |
297 | return false; |
298 | } else { |
299 | return true; |
300 | } |
301 | }; |
302 | |
303 | #undef MASK |
304 | |
305 | if (width == 1) { |
306 | while (p <= end - 8) { |
307 | if (!test_eight_items(addend8, mask_uint8)) { |
308 | width = 2; |
309 | goto width2; |
310 | } |
311 | } |
312 | while (p < end) { |
313 | if (!test_one_item(addend8, mask_uint8)) { |
314 | width = 2; |
315 | goto width2; |
316 | } |
317 | } |
318 | return 1; |
319 | } |
320 | width2: |
321 | if (width == 2) { |
322 | while (p <= end - 8) { |
323 | if (!test_eight_items(addend16, mask_uint16)) { |
324 | width = 4; |
325 | goto width4; |
326 | } |
327 | } |
328 | while (p < end) { |
329 | if (!test_one_item(addend16, mask_uint16)) { |
330 | width = 4; |
331 | goto width4; |
332 | } |
333 | } |
334 | return 2; |
335 | } |
336 | width4: |
337 | if (width == 4) { |
338 | while (p <= end - 8) { |
339 | if (!test_eight_items(addend32, mask_uint32)) { |
340 | width = 8; |
341 | goto width8; |
342 | } |
343 | } |
344 | while (p < end) { |
345 | if (!test_one_item(addend32, mask_uint32)) { |
346 | width = 8; |
347 | goto width8; |
348 | } |
349 | } |
350 | return 4; |
351 | } |
352 | width8: |
353 | return 8; |
354 | } |
355 | |
// Element-wise static_cast of `length` values from Source to Dest.
// Callers guarantee every value fits in the destination type.
template <typename Source, typename Dest>
inline void DowncastIntsInternal(const Source* src, Dest* dest, int64_t length) {
  int64_t i = 0;
  // Process four entries per iteration to amortize loop overhead.
  for (; i + 4 <= length; i += 4) {
    dest[i] = static_cast<Dest>(src[i]);
    dest[i + 1] = static_cast<Dest>(src[i + 1]);
    dest[i + 2] = static_cast<Dest>(src[i + 2]);
    dest[i + 3] = static_cast<Dest>(src[i + 3]);
  }
  // Remaining (< 4) entries.
  for (; i < length; ++i) {
    dest[i] = static_cast<Dest>(src[i]);
  }
}
372 | |
373 | void DowncastInts(const int64_t* source, int8_t* dest, int64_t length) { |
374 | DowncastIntsInternal(source, dest, length); |
375 | } |
376 | |
377 | void DowncastInts(const int64_t* source, int16_t* dest, int64_t length) { |
378 | DowncastIntsInternal(source, dest, length); |
379 | } |
380 | |
381 | void DowncastInts(const int64_t* source, int32_t* dest, int64_t length) { |
382 | DowncastIntsInternal(source, dest, length); |
383 | } |
384 | |
// Same-width "downcast": a plain element-wise copy.
void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
  std::copy(source, source + length, dest);
}
388 | |
389 | void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length) { |
390 | DowncastIntsInternal(source, dest, length); |
391 | } |
392 | |
393 | void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length) { |
394 | DowncastIntsInternal(source, dest, length); |
395 | } |
396 | |
397 | void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length) { |
398 | DowncastIntsInternal(source, dest, length); |
399 | } |
400 | |
// Same-width "downcast": a raw memory copy.
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
  // Use sizeof(uint64_t), not sizeof(int64_t): same size, but consistent
  // with the unsigned element type being copied.
  memcpy(dest, source, length * sizeof(uint64_t));
}
404 | |
// For each of the `length` input values, look up `transpose_map[value]`
// and store the result in `dest`.  Every input value must be a valid
// (in-bounds) index into `transpose_map`.
template <typename InputInt, typename OutputInt>
void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
                   const int32_t* transpose_map) {
  int64_t i = 0;
  // Process four entries per iteration to amortize loop overhead.
  for (; i + 4 <= length; i += 4) {
    dest[i] = static_cast<OutputInt>(transpose_map[src[i]]);
    dest[i + 1] = static_cast<OutputInt>(transpose_map[src[i + 1]]);
    dest[i + 2] = static_cast<OutputInt>(transpose_map[src[i + 2]]);
    dest[i + 3] = static_cast<OutputInt>(transpose_map[src[i + 3]]);
  }
  // Remaining (< 4) entries.
  for (; i < length; ++i) {
    dest[i] = static_cast<OutputInt>(transpose_map[src[i]]);
  }
}
422 | |
// Explicitly instantiate (and export from the shared library) TransposeInts
// for every combination of signed integer source and destination widths
// (8/16/32/64 bits), so the template definition can live in this .cc file.
#define INSTANTIATE(SRC, DEST)              \
  template ARROW_EXPORT void TransposeInts( \
      const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map);

// All source widths for a fixed destination width.
#define INSTANTIATE_ALL_DEST(DEST) \
  INSTANTIATE(int8_t, DEST)        \
  INSTANTIATE(int16_t, DEST)       \
  INSTANTIATE(int32_t, DEST)       \
  INSTANTIATE(int64_t, DEST)

// All 16 source/destination combinations.
#define INSTANTIATE_ALL()        \
  INSTANTIATE_ALL_DEST(int8_t)   \
  INSTANTIATE_ALL_DEST(int16_t)  \
  INSTANTIATE_ALL_DEST(int32_t)  \
  INSTANTIATE_ALL_DEST(int64_t)

INSTANTIATE_ALL()

#undef INSTANTIATE
#undef INSTANTIATE_ALL
#undef INSTANTIATE_ALL_DEST
444 | |
445 | } // namespace internal |
446 | } // namespace arrow |
447 | |