1 | /* |
2 | * Copyright 2015-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include <folly/detail/RangeSse42.h> |
18 | |
19 | #include <glog/logging.h> |
20 | |
21 | #include <folly/Portability.h> |
22 | |
23 | // Essentially, two versions of this file: one with an SSE42 implementation |
24 | // and one with a fallback implementation. We determine which version to use by |
25 | // testing for the presence of the required headers. |
26 | // |
27 | // TODO: Maybe this should be done by the build system.... |
28 | #if !FOLLY_SSE_PREREQ(4, 2) |
29 | namespace folly { |
30 | namespace detail { |
31 | size_t qfind_first_byte_of_sse42( |
32 | const StringPieceLite haystack, |
33 | const StringPieceLite needles) { |
34 | return qfind_first_byte_of_nosse(haystack, needles); |
35 | } |
36 | } // namespace detail |
37 | } // namespace folly |
38 | #else |
39 | #include <cstdint> |
40 | #include <limits> |
41 | #include <string> |
42 | |
43 | #include <emmintrin.h> |
44 | #include <nmmintrin.h> |
45 | #include <smmintrin.h> |
46 | |
47 | #include <folly/Likely.h> |
48 | #include <folly/detail/Sse.h> |
49 | |
50 | namespace folly { |
51 | namespace detail { |
52 | |
53 | // It's okay if pages are bigger than this (as powers of two), but they should |
54 | // not be smaller. |
55 | static constexpr size_t kMinPageSize = 4096; |
56 | static_assert( |
57 | kMinPageSize >= 16, |
58 | "kMinPageSize must be at least SSE register size" ); |
59 | |
60 | template <typename T> |
61 | static inline uintptr_t page_for(T* addr) { |
62 | return reinterpret_cast<uintptr_t>(addr) / kMinPageSize; |
63 | } |
64 | |
65 | static inline size_t nextAlignedIndex(const char* arr) { |
66 | auto firstPossible = reinterpret_cast<uintptr_t>(arr) + 1; |
67 | return 1 + // add 1 because the index starts at 'arr' |
68 | ((firstPossible + 15) & ~0xF) // round up to next multiple of 16 |
69 | - firstPossible; |
70 | } |
71 | |
72 | // helper method for case where needles.size() <= 16 |
73 | size_t qfind_first_byte_of_needles16( |
74 | const StringPieceLite haystack, |
75 | const StringPieceLite needles) { |
76 | DCHECK_GT(haystack.size(), 0u); |
77 | DCHECK_GT(needles.size(), 0u); |
78 | DCHECK_LE(needles.size(), 16u); |
79 | if ((needles.size() <= 2 && haystack.size() >= 256) || |
80 | // must bail if we can't even SSE-load a single segment of haystack |
81 | (haystack.size() < 16 && |
82 | page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) || |
83 | // can't load needles into SSE register if it could cross page boundary |
84 | page_for(needles.end() - 1) != page_for(needles.data() + 15)) { |
85 | return detail::qfind_first_byte_of_nosse(haystack, needles); |
86 | } |
87 | |
88 | auto arr2 = _mm_loadu_si128_unchecked( |
89 | reinterpret_cast<const __m128i*>(needles.data())); |
90 | // do an unaligned load for first block of haystack |
91 | auto arr1 = _mm_loadu_si128_unchecked( |
92 | reinterpret_cast<const __m128i*>(haystack.data())); |
93 | auto index = |
94 | _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0); |
95 | if (index < 16) { |
96 | return size_t(index); |
97 | } |
98 | |
99 | // Now, we can do aligned loads hereafter... |
100 | size_t i = nextAlignedIndex(haystack.data()); |
101 | for (; i < haystack.size(); i += 16) { |
102 | arr1 = _mm_load_si128_unchecked( |
103 | reinterpret_cast<const __m128i*>(haystack.data() + i)); |
104 | index = _mm_cmpestri( |
105 | arr2, int(needles.size()), arr1, int(haystack.size() - i), 0); |
106 | if (index < 16) { |
107 | return i + index; |
108 | } |
109 | } |
110 | return std::string::npos; |
111 | } |
112 | |
113 | // Scans a 16-byte block of haystack (starting at blockStartIdx) to find first |
114 | // needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned. |
115 | // If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the |
116 | // block. |
117 | template <bool HAYSTACK_ALIGNED> |
118 | size_t scanHaystackBlock( |
119 | const StringPieceLite haystack, |
120 | const StringPieceLite needles, |
121 | uint64_t blockStartIdx) { |
122 | DCHECK_GT(needles.size(), 16u); // should handled by *needles16() method |
123 | DCHECK( |
124 | blockStartIdx + 16 <= haystack.size() || |
125 | (page_for(haystack.data() + blockStartIdx) == |
126 | page_for(haystack.data() + blockStartIdx + 15))); |
127 | |
128 | __m128i arr1; |
129 | if (HAYSTACK_ALIGNED) { |
130 | arr1 = _mm_load_si128_unchecked( |
131 | reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx)); |
132 | } else { |
133 | arr1 = _mm_loadu_si128_unchecked( |
134 | reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx)); |
135 | } |
136 | |
137 | // This load is safe because needles.size() >= 16 |
138 | auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data())); |
139 | auto b = |
140 | _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0); |
141 | |
142 | size_t j = nextAlignedIndex(needles.data()); |
143 | for (; j < needles.size(); j += 16) { |
144 | arr2 = _mm_load_si128_unchecked( |
145 | reinterpret_cast<const __m128i*>(needles.data() + j)); |
146 | |
147 | auto index = _mm_cmpestri( |
148 | arr2, |
149 | int(needles.size() - j), |
150 | arr1, |
151 | int(haystack.size() - blockStartIdx), |
152 | 0); |
153 | b = std::min(index, b); |
154 | } |
155 | |
156 | if (b < 16) { |
157 | return blockStartIdx + b; |
158 | } |
159 | return std::string::npos; |
160 | } |
161 | |
162 | size_t qfind_first_byte_of_sse42( |
163 | const StringPieceLite haystack, |
164 | const StringPieceLite needles); |
165 | |
166 | size_t qfind_first_byte_of_sse42( |
167 | const StringPieceLite haystack, |
168 | const StringPieceLite needles) { |
169 | if (UNLIKELY(needles.empty() || haystack.empty())) { |
170 | return std::string::npos; |
171 | } else if (needles.size() <= 16) { |
172 | // we can save some unnecessary load instructions by optimizing for |
173 | // the common case of needles.size() <= 16 |
174 | return qfind_first_byte_of_needles16(haystack, needles); |
175 | } |
176 | |
177 | if (haystack.size() < 16 && |
178 | page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) { |
179 | // We can't safely SSE-load haystack. Use a different approach. |
180 | if (haystack.size() <= 2) { |
181 | return qfind_first_byte_of_std(haystack, needles); |
182 | } |
183 | return qfind_first_byte_of_byteset(haystack, needles); |
184 | } |
185 | |
186 | auto ret = scanHaystackBlock<false>(haystack, needles, 0); |
187 | if (ret != std::string::npos) { |
188 | return ret; |
189 | } |
190 | |
191 | size_t i = nextAlignedIndex(haystack.data()); |
192 | for (; i < haystack.size(); i += 16) { |
193 | ret = scanHaystackBlock<true>(haystack, needles, i); |
194 | if (ret != std::string::npos) { |
195 | return ret; |
196 | } |
197 | } |
198 | |
199 | return std::string::npos; |
200 | } |
201 | } // namespace detail |
202 | } // namespace folly |
203 | #endif |
204 | |