1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
19// functions needed for parquet-cpp
20
21#ifndef ARROW_UTIL_SSE_UTIL_H
22#define ARROW_UTIL_SSE_UTIL_H
23
// Reset the feature macros so that, after this section, they are defined only
// when the checks below set them.
#undef ARROW_HAVE_SSE2
#undef ARROW_HAVE_SSE4_2

// Feature detection only runs when the build opts in via ARROW_USE_SIMD;
// otherwise both ARROW_HAVE_* macros stay undefined.
#ifdef ARROW_USE_SIMD

// MSVC x86-64: both feature macros are set unconditionally for this target.

#if (defined(_M_AMD64) || defined(_M_X64))
#define ARROW_HAVE_SSE2 1
#define ARROW_HAVE_SSE4_2 1
#include <intrin.h>
#endif

// gcc/clang (possibly others): rely on the compiler's predefined macros.

// SSE2 is detected through its own macro (__SSE2__), independently of SSE4.2,
// so that SSE2-only targets still get ARROW_HAVE_SSE2.
#if defined(__SSE2__)
#define ARROW_HAVE_SSE2 1
#include <emmintrin.h>
#endif

#if defined(__SSE4_2__)
#define ARROW_HAVE_SSE4_2 1
#include <nmmintrin.h>
#endif

#endif  // ARROW_USE_SIMD
50
51namespace arrow {
52
/// Constants useful for text processing with SSE4.2 intrinsics.
namespace SSEUtil {
/// SSE can load 64 or 128 bits into a register at a time; these are the number
/// of 8-bit characters that fit in each case.
static const int CHARS_PER_64_BIT_REGISTER = 64 / 8;
static const int CHARS_PER_128_BIT_REGISTER = 128 / 8;

/// The SSE4.2 text-processing instructions take a control byte that selects part
/// of their behavior. These are the pieces of that byte used here (equivalent to
/// GCC's _SIDD_CMP_EQUAL_ANY, etc).
static const int PCMPSTR_EQUAL_ANY = 0;          // strchr
static const int PCMPSTR_EQUAL_EACH = 1 << 3;    // strcmp
static const int PCMPSTR_UBYTE_OPS = 0;          // unsigned char (8-bits, rather than 16)
static const int PCMPSTR_NEG_POLARITY = 1 << 4;  // see Intel SDM chapter 4.1.4.

/// Control byte for strchr-like use: the text processing functions return a mask
/// of all the characters that matched.
static const int STRCHR_MODE = PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_ANY;

/// Control byte for strcmp-like use: the text processing functions return the
/// number of bytes that match consecutively from the beginning.
static const int STRCMP_MODE =
    PCMPSTR_NEG_POLARITY | PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_EACH;

/// Single-bit masks, one per character position of a 128-bit register
/// (entry i has only bit i set).
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
    0x0001, 0x0002, 0x0004, 0x0008,
    0x0010, 0x0020, 0x0040, 0x0080,
    0x0100, 0x0200, 0x0400, 0x0800,
    0x1000, 0x2000, 0x4000, 0x8000,
};
}  // namespace SSEUtil
83
84#ifdef ARROW_HAVE_SSE4_2
85
/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or at
/// codegen IR load time) that the processor supports SSE 4.2 before calling these.
/// These are defined outside the SSEUtil namespace because the IR w/ SSE 4.2 case
/// needs to use macros.
89
90template <int MODE>
91static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
92 return _mm_cmpestrm(str1, len1, str2, len2, MODE);
93}
94
95template <int MODE>
96static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
97 return _mm_cmpestri(str1, len1, str2, len2, MODE);
98}
99
100static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
101 return _mm_crc32_u8(crc, v);
102}
103
104static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
105 return _mm_crc32_u16(crc, v);
106}
107
108static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
109 return _mm_crc32_u32(crc, v);
110}
111
112static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
113 return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
114}
115
116#else // without SSE 4.2.
117
118// __m128i may not be defined, so deduce it with a template parameter
119template <int MODE, typename __m128i>
120static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
121 DCHECK(false) << "CPU doesn't support SSE 4.2";
122 return (__m128i){0}; // NOLINT
123}
124
125template <int MODE, typename __m128i>
126static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
127 DCHECK(false) << "CPU doesn't support SSE 4.2";
128 return 0;
129}
130
131static inline uint32_t SSE4_crc32_u8(uint32_t, uint8_t) {
132 DCHECK(false) << "SSE support is not enabled";
133 return 0;
134}
135
136static inline uint32_t SSE4_crc32_u16(uint32_t, uint16_t) {
137 DCHECK(false) << "SSE support is not enabled";
138 return 0;
139}
140
141static inline uint32_t SSE4_crc32_u32(uint32_t, uint32_t) {
142 DCHECK(false) << "SSE support is not enabled";
143 return 0;
144}
145
146static inline uint32_t SSE4_crc32_u64(uint32_t, uint64_t) {
147 DCHECK(false) << "SSE support is not enabled";
148 return 0;
149}
150
151#endif // ARROW_HAVE_SSE4_2
152
153} // namespace arrow
154
155#endif // ARROW_UTIL_SSE_UTIL_H
156