1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | // From Apache Impala as of 2016-01-29. Pared down to a minimal set of |
19 | // functions needed for parquet-cpp |
20 | |
21 | #ifndef ARROW_UTIL_SSE_UTIL_H |
22 | #define ARROW_UTIL_SSE_UTIL_H |
23 | |
// Fixed-width integer types (uint8_t ... uint64_t) used by the wrappers in this header
// in every configuration, including the one where no intrinsics header is included.
#include <cstdint>

// Start from a clean slate so that ARROW_HAVE_SSE2 / ARROW_HAVE_SSE4_2 reflect only the
// detection performed below.
#undef ARROW_HAVE_SSE2
#undef ARROW_HAVE_SSE4_2

// All SIMD detection is opt-in: nothing is defined unless ARROW_USE_SIMD is set.
#ifdef ARROW_USE_SIMD

// MSVC x86-64: both feature macros are set whenever the target is identified as
// x86-64 through _M_AMD64 / _M_X64.

#if (defined(_M_AMD64) || defined(_M_X64))
#define ARROW_HAVE_SSE2 1
#define ARROW_HAVE_SSE4_2 1
#include <intrin.h>
#endif

// gcc/clang (possibly others): each feature level is keyed on its own predefined
// macro, so a target that offers SSE2 without SSE4.2 still gets ARROW_HAVE_SSE2.

#if defined(__SSE2__)
#define ARROW_HAVE_SSE2 1
#include <emmintrin.h>
#endif

#if defined(__SSE4_2__)
#define ARROW_HAVE_SSE4_2 1
#include <nmmintrin.h>
#endif

#endif  // ARROW_USE_SIMD
50 | |
51 | namespace arrow { |
52 | |
/// Constants that are handy when doing text processing with the SSE4.2 intrinsics.
namespace SSEUtil {
/// How many 8-bit characters fit into a 64-bit and into a 128-bit register. SSE offers
/// instructions that load either 64 or 128 bits into a register at once.
static const int CHARS_PER_64_BIT_REGISTER = 64 / 8;
static const int CHARS_PER_128_BIT_REGISTER = 128 / 8;

/// The SSE4.2 text-processing instructions take a control byte that selects part of
/// their behaviour. These values mirror GCC's _SIDD_CMP_EQUAL_ANY and friends.
static const int PCMPSTR_EQUAL_ANY = 0;          // strchr
static const int PCMPSTR_EQUAL_EACH = 1 << 3;    // strcmp
static const int PCMPSTR_UBYTE_OPS = 0;          // unsigned char (8-bits, rather than 16)
static const int PCMPSTR_NEG_POLARITY = 1 << 4;  // see Intel SDM chapter 4.1.4.

/// Control byte for "strchr"-like use: the SSE text processing functions return a mask
/// of every character that matched.
static const int STRCHR_MODE = PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_ANY;

/// Control byte for "strcmp"-like use: the SSE text processing functions return the
/// number of bytes that match consecutively, counted from the beginning.
static const int STRCMP_MODE =
    PCMPSTR_NEG_POLARITY | PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS;

/// Table of single-bit masks, one per character position of a 128-bit register:
/// entry i has only bit i set.
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
    0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000,
};
}  // namespace SSEUtil
83 | |
84 | #ifdef ARROW_HAVE_SSE4_2 |
85 | |
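/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen
/// IR load time) that the processor supports SSE 4.2 before calling these.
/// NOTE: the comment inherited with this code (see the Impala attribution at the top of
/// this file) described these wrappers as defined outside the namespace because the
/// IR w/ SSE 4.2 case needs to use macros; in this header they are declared inside
/// namespace arrow.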
89 | |
/// Thin wrapper around the `_mm_cmpestrm` intrinsic (SSE 4.2 build only).
///
/// MODE is forwarded as the intrinsic's last (control) argument; taking it as a
/// template parameter keeps it a compile-time constant at the call to the intrinsic.
///
/// \param str1 first 128-bit operand
/// \param len1 length passed alongside str1
/// \param str2 second 128-bit operand
/// \param len2 length passed alongside str2
/// \return the 128-bit value produced by `_mm_cmpestrm`, unchanged
///
/// The caller is responsible for ensuring the CPU supports SSE 4.2.
template <int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
  return _mm_cmpestrm(str1, len1, str2, len2, MODE);
}
94 | |
/// Thin wrapper around the `_mm_cmpestri` intrinsic (SSE 4.2 build only).
///
/// MODE is forwarded as the intrinsic's last (control) argument; taking it as a
/// template parameter keeps it a compile-time constant at the call to the intrinsic.
///
/// \param str1 first 128-bit operand
/// \param len1 length passed alongside str1
/// \param str2 second 128-bit operand
/// \param len2 length passed alongside str2
/// \return the integer produced by `_mm_cmpestri`, unchanged
///
/// The caller is responsible for ensuring the CPU supports SSE 4.2.
template <int MODE>
static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
  return _mm_cmpestri(str1, len1, str2, len2, MODE);
}
99 | |
/// Thin wrapper around the `_mm_crc32_u8` intrinsic (SSE 4.2 build only).
///
/// \param crc CRC value passed as the intrinsic's first argument
/// \param v 8-bit input passed as the intrinsic's second argument
/// \return the value returned by `_mm_crc32_u8`
///
/// The caller is responsible for ensuring the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
  return _mm_crc32_u8(crc, v);
}
103 | |
/// Thin wrapper around the `_mm_crc32_u16` intrinsic (SSE 4.2 build only).
///
/// \param crc CRC value passed as the intrinsic's first argument
/// \param v 16-bit input passed as the intrinsic's second argument
/// \return the value returned by `_mm_crc32_u16`
///
/// The caller is responsible for ensuring the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
  return _mm_crc32_u16(crc, v);
}
107 | |
/// Thin wrapper around the `_mm_crc32_u32` intrinsic (SSE 4.2 build only).
///
/// \param crc CRC value passed as the intrinsic's first argument
/// \param v 32-bit input passed as the intrinsic's second argument
/// \return the value returned by `_mm_crc32_u32`
///
/// The caller is responsible for ensuring the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
  return _mm_crc32_u32(crc, v);
}
111 | |
/// Thin wrapper around the `_mm_crc32_u64` intrinsic (SSE 4.2 build only).
///
/// \param crc CRC value passed as the intrinsic's first argument
/// \param v 64-bit input passed as the intrinsic's second argument
/// \return the intrinsic's result narrowed to 32 bits
///
/// The caller is responsible for ensuring the CPU supports SSE 4.2.
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
  // The explicit cast narrows the intrinsic's result to this wrapper's uint32_t
  // return type, matching the other SSE4_crc32_* wrappers.
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}
115 | |
116 | #else // without SSE 4.2. |
117 | |
118 | // __m128i may not be defined, so deduce it with a template parameter |
119 | template <int MODE, typename __m128i> |
120 | static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { |
121 | DCHECK(false) << "CPU doesn't support SSE 4.2" ; |
122 | return (__m128i){0}; // NOLINT |
123 | } |
124 | |
/// Fallback for SSE4_cmpestri, compiled when ARROW_HAVE_SSE4_2 is not defined.
///
/// `__m128i` is a deduced template parameter here rather than the real vector type,
/// so this compiles even when no SSE header has declared that type.
/// Performs no comparison: it reports a DCHECK failure and then returns 0.
/// NOTE(review): DCHECK is defined outside this file; presumably the `return 0` is only
/// reached in builds where the check does not abort -- confirm.
///
/// \param str1 ignored
/// \param len1 ignored
/// \param str2 ignored
/// \param len2 ignored
/// \return always 0
template <int MODE, typename __m128i>
static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
  DCHECK(false) << "CPU doesn't support SSE 4.2" ;
  return 0;
}
130 | |
/// Fallback for SSE4_crc32_u8, compiled when ARROW_HAVE_SSE4_2 is not defined.
///
/// Both arguments are unnamed and ignored; no checksum is computed. The function
/// reports a DCHECK failure and then returns 0.
/// NOTE(review): DCHECK is defined outside this file; presumably the `return 0` is only
/// reached in builds where the check does not abort -- confirm.
static inline uint32_t SSE4_crc32_u8(uint32_t, uint8_t) {
  DCHECK(false) << "SSE support is not enabled" ;
  return 0;
}
135 | |
/// Fallback for SSE4_crc32_u16, compiled when ARROW_HAVE_SSE4_2 is not defined.
///
/// Both arguments are unnamed and ignored; no checksum is computed. The function
/// reports a DCHECK failure and then returns 0.
/// NOTE(review): DCHECK is defined outside this file; presumably the `return 0` is only
/// reached in builds where the check does not abort -- confirm.
static inline uint32_t SSE4_crc32_u16(uint32_t, uint16_t) {
  DCHECK(false) << "SSE support is not enabled" ;
  return 0;
}
140 | |
/// Fallback for SSE4_crc32_u32, compiled when ARROW_HAVE_SSE4_2 is not defined.
///
/// Both arguments are unnamed and ignored; no checksum is computed. The function
/// reports a DCHECK failure and then returns 0.
/// NOTE(review): DCHECK is defined outside this file; presumably the `return 0` is only
/// reached in builds where the check does not abort -- confirm.
static inline uint32_t SSE4_crc32_u32(uint32_t, uint32_t) {
  DCHECK(false) << "SSE support is not enabled" ;
  return 0;
}
145 | |
/// Fallback for SSE4_crc32_u64, compiled when ARROW_HAVE_SSE4_2 is not defined.
///
/// Both arguments are unnamed and ignored; no checksum is computed. The function
/// reports a DCHECK failure and then returns 0.
/// NOTE(review): DCHECK is defined outside this file; presumably the `return 0` is only
/// reached in builds where the check does not abort -- confirm.
static inline uint32_t SSE4_crc32_u64(uint32_t, uint64_t) {
  DCHECK(false) << "SSE support is not enabled" ;
  return 0;
}
150 | |
151 | #endif // ARROW_HAVE_SSE4_2 |
152 | |
153 | } // namespace arrow |
154 | |
155 | #endif // ARROW_UTIL_SSE_UTIL_H |
156 | |