1/*
2 * Copyright 2015-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <folly/detail/RangeSse42.h>
18
19#include <glog/logging.h>
20
21#include <folly/Portability.h>
22
23// Essentially, two versions of this file: one with an SSE42 implementation
24// and one with a fallback implementation. We determine which version to use by
25// testing for the presence of the required headers.
26//
27// TODO: Maybe this should be done by the build system....
28#if !FOLLY_SSE_PREREQ(4, 2)
29namespace folly {
30namespace detail {
// Fallback definition: when SSE4.2 is unavailable at compile time, keep the
// symbol but delegate to the portable implementation so callers link either way.
size_t qfind_first_byte_of_sse42(
    const StringPieceLite haystack,
    const StringPieceLite needles) {
  return qfind_first_byte_of_nosse(haystack, needles);
}
36} // namespace detail
37} // namespace folly
38#else
#include <algorithm>
#include <cstdint>
#include <limits>
#include <string>
42
43#include <emmintrin.h>
44#include <nmmintrin.h>
45#include <smmintrin.h>
46
47#include <folly/Likely.h>
48#include <folly/detail/Sse.h>
49
50namespace folly {
51namespace detail {
52
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller. This lower bound is what lets the page_for() checks below
// prove that a 16-byte SSE load cannot fault: if the first and last byte of a
// 16-byte window share a page, the whole window is readable.
static constexpr size_t kMinPageSize = 4096;
static_assert(
    kMinPageSize >= 16,
    "kMinPageSize must be at least SSE register size");
59
60template <typename T>
61static inline uintptr_t page_for(T* addr) {
62 return reinterpret_cast<uintptr_t>(addr) / kMinPageSize;
63}
64
// Smallest index i >= 1 such that (arr + i) is 16-byte aligned; i.e. the
// offset of the first aligned address strictly after arr.
static inline size_t nextAlignedIndex(const char* arr) {
  auto const start = reinterpret_cast<uintptr_t>(arr) + 1;
  auto const rounded = (start + 15) & ~uintptr_t{0xF}; // round up to 16
  return size_t(1 + rounded - start);
}
71
72// helper method for case where needles.size() <= 16
73size_t qfind_first_byte_of_needles16(
74 const StringPieceLite haystack,
75 const StringPieceLite needles) {
76 DCHECK_GT(haystack.size(), 0u);
77 DCHECK_GT(needles.size(), 0u);
78 DCHECK_LE(needles.size(), 16u);
79 if ((needles.size() <= 2 && haystack.size() >= 256) ||
80 // must bail if we can't even SSE-load a single segment of haystack
81 (haystack.size() < 16 &&
82 page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
83 // can't load needles into SSE register if it could cross page boundary
84 page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
85 return detail::qfind_first_byte_of_nosse(haystack, needles);
86 }
87
88 auto arr2 = _mm_loadu_si128_unchecked(
89 reinterpret_cast<const __m128i*>(needles.data()));
90 // do an unaligned load for first block of haystack
91 auto arr1 = _mm_loadu_si128_unchecked(
92 reinterpret_cast<const __m128i*>(haystack.data()));
93 auto index =
94 _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
95 if (index < 16) {
96 return size_t(index);
97 }
98
99 // Now, we can do aligned loads hereafter...
100 size_t i = nextAlignedIndex(haystack.data());
101 for (; i < haystack.size(); i += 16) {
102 arr1 = _mm_load_si128_unchecked(
103 reinterpret_cast<const __m128i*>(haystack.data() + i));
104 index = _mm_cmpestri(
105 arr2, int(needles.size()), arr1, int(haystack.size() - i), 0);
106 if (index < 16) {
107 return i + index;
108 }
109 }
110 return std::string::npos;
111}
112
113// Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
114// needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
115// If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
116// block.
// Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
// needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
// If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
// block.
//
// Returns the absolute haystack index of the first match in this block, or
// std::string::npos if the block contains no needle byte. Because
// needles.size() > 16, the needle set is scanned in 16-byte chunks and the
// minimum match position across chunks is taken.
template <bool HAYSTACK_ALIGNED>
size_t scanHaystackBlock(
    const StringPieceLite haystack,
    const StringPieceLite needles,
    uint64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16u); // should be handled by *needles16() method
  // Either the full 16-byte block lies inside haystack, or the block does not
  // cross a page boundary (so over-reading past haystack.end() cannot fault).
  DCHECK(
      blockStartIdx + 16 <= haystack.size() ||
      (page_for(haystack.data() + blockStartIdx) ==
       page_for(haystack.data() + blockStartIdx + 15)));

  __m128i arr1;
  if (HAYSTACK_ALIGNED) {
    arr1 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  } else {
    arr1 = _mm_loadu_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  }

  // This load is safe because needles.size() >= 16
  auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
  // b tracks the smallest match index within this haystack block (16 = none).
  auto b =
      _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);

  // Scan the remaining needles in aligned 16-byte chunks; overlapping the
  // unaligned first chunk is harmless since we take the minimum index.
  size_t j = nextAlignedIndex(needles.data());
  for (; j < needles.size(); j += 16) {
    arr2 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(needles.data() + j));

    auto index = _mm_cmpestri(
        arr2,
        int(needles.size() - j),
        arr1,
        int(haystack.size() - blockStartIdx),
        0);
    b = std::min(index, b);
  }

  if (b < 16) {
    return blockStartIdx + b;
  }
  return std::string::npos;
}
161
162size_t qfind_first_byte_of_sse42(
163 const StringPieceLite haystack,
164 const StringPieceLite needles);
165
166size_t qfind_first_byte_of_sse42(
167 const StringPieceLite haystack,
168 const StringPieceLite needles) {
169 if (UNLIKELY(needles.empty() || haystack.empty())) {
170 return std::string::npos;
171 } else if (needles.size() <= 16) {
172 // we can save some unnecessary load instructions by optimizing for
173 // the common case of needles.size() <= 16
174 return qfind_first_byte_of_needles16(haystack, needles);
175 }
176
177 if (haystack.size() < 16 &&
178 page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
179 // We can't safely SSE-load haystack. Use a different approach.
180 if (haystack.size() <= 2) {
181 return qfind_first_byte_of_std(haystack, needles);
182 }
183 return qfind_first_byte_of_byteset(haystack, needles);
184 }
185
186 auto ret = scanHaystackBlock<false>(haystack, needles, 0);
187 if (ret != std::string::npos) {
188 return ret;
189 }
190
191 size_t i = nextAlignedIndex(haystack.data());
192 for (; i < haystack.size(); i += 16) {
193 ret = scanHaystackBlock<true>(haystack, needles, i);
194 if (ret != std::string::npos) {
195 return ret;
196 }
197 }
198
199 return std::string::npos;
200}
201} // namespace detail
202} // namespace folly
203#endif
204