1#pragma once
2
3#include <cstdint>
4#include <algorithm>
5
6#include <Core/Defines.h>
7
8
namespace detail
{

/** Three-way comparison of two values using operator< and operator>.
  * Returns -1 if a < b, 1 if a > b and 0 otherwise.
  */
template <typename T>
inline int cmp(T a, T b)
{
    return a < b ? -1 : (a > b ? 1 : 0);
}

}
23
24
/// The functions below may read uninitialized memory.
/// The results do not depend on the values in that memory, but Memory Sanitizer cannot see this.
/// Therefore the optimized functions are disabled when compiling with Memory Sanitizer.
28
29#if defined(__SSE2__) && !defined(MEMORY_SANITIZER)
30#include <emmintrin.h>
31
32
/** All functions work under the following assumptions:
  * - it is possible to read up to 15 excessive bytes after the end of the 'a' and 'b' regions;
  * - memory regions are relatively small, so extra loop unrolling is not worthwhile.
  */
37
38/** Variant when memory regions may have different sizes.
39 */
40template <typename Char>
41inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
42{
43 size_t min_size = std::min(a_size, b_size);
44
45 for (size_t offset = 0; offset < min_size; offset += 16)
46 {
47 uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
48 _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
49 _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
50 mask = ~mask;
51
52 if (mask)
53 {
54 offset += __builtin_ctz(mask);
55
56 if (offset >= min_size)
57 break;
58
59 return detail::cmp(a[offset], b[offset]);
60 }
61 }
62
63 return detail::cmp(a_size, b_size);
64}
65
66
67/** Variant when memory regions have same size.
68 * TODO Check if the compiler can optimize previous function when the caller pass identical sizes.
69 */
70template <typename Char>
71inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size)
72{
73 for (size_t offset = 0; offset < size; offset += 16)
74 {
75 uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
76 _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
77 _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
78 mask = ~mask;
79
80 if (mask)
81 {
82 offset += __builtin_ctz(mask);
83
84 if (offset >= size)
85 return 0;
86
87 return detail::cmp(a[offset], b[offset]);
88 }
89 }
90
91 return 0;
92}
93
94
/** Check two memory regions for equality.
  * Regions of different sizes are never equal.
  * Loads are 16 bytes wide, so up to 15 bytes past the end of each region may be read;
  * differences found in those excessive bytes are ignored.
  */
template <typename Char>
inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
{
    if (a_size != b_size)
        return false;

    size_t pos = 0;
    while (pos < a_size)
    {
        const __m128i lhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + pos));
        const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + pos));

        /// Bit i is set when byte i of the two chunks differs.
        const uint16_t diff = static_cast<uint16_t>(~_mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs)));

        /// Only the first difference matters: the regions are equal iff it lies past the end.
        if (diff != 0)
            return pos + __builtin_ctz(diff) >= a_size;

        pos += 16;
    }

    return true;
}
119
120
121/** Variant when the caller know in advance that the size is a multiple of 16.
122 */
123template <typename Char>
124inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
125{
126 for (size_t offset = 0; offset < size; offset += 16)
127 {
128 uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
129 _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)),
130 _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset))));
131 mask = ~mask;
132
133 if (mask)
134 {
135 offset += __builtin_ctz(mask);
136 return detail::cmp(a[offset], b[offset]);
137 }
138 }
139
140 return 0;
141}
142
143
144/** Variant when the size is 16 exactly.
145 */
146template <typename Char>
147inline int memcmp16(const Char * a, const Char * b)
148{
149 uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
150 _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)),
151 _mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
152 mask = ~mask;
153
154 if (mask)
155 {
156 auto offset = __builtin_ctz(mask);
157 return detail::cmp(a[offset], b[offset]);
158 }
159
160 return 0;
161}
162
163
/** Equality check for the case when the size is exactly 16.
  * Returns true iff all 16 bytes at 'a' and 'b' are the same.
  */
inline bool memequal16(const void * a, const void * b)
{
    const __m128i lhs = _mm_loadu_si128(static_cast<const __m128i *>(a));
    const __m128i rhs = _mm_loadu_si128(static_cast<const __m128i *>(b));

    /// One bit per byte; all 16 bits are set only when every byte matches.
    return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs)) == 0xFFFF;
}
172
173
/** Check that a memory region consists of zero bytes only.
  * Loads are 16 bytes wide, so up to 15 bytes past the end of the region may be read;
  * non-zero bytes found in those excessive bytes are ignored.
  */
inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
{
    const char * bytes = reinterpret_cast<const char *>(data);
    const __m128i zero16 = _mm_setzero_si128();

    size_t pos = 0;
    while (pos < size)
    {
        const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes + pos));

        /// Bit i is set when byte i of the chunk is non-zero.
        const uint16_t nonzero = static_cast<uint16_t>(~_mm_movemask_epi8(_mm_cmpeq_epi8(zero16, chunk)));

        /// Only the first non-zero byte matters: the region is zero iff it lies past the end.
        if (nonzero != 0)
            return pos + __builtin_ctz(nonzero) >= size;

        pos += 16;
    }

    return true;
}
194
195
196#else
197
198#include <cstring>
199
200template <typename Char>
201inline int memcmpSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
202{
203 if (auto res = memcmp(a, b, std::min(a_size, b_size)))
204 return res;
205 else
206 return detail::cmp(a_size, b_size);
207}
208
/** Portable variant for memory regions of the same size: a plain memcmp over 'size' bytes.
  * Only the sign of the result is meaningful.
  */
template <typename Char>
inline int memcmpSmallAllowOverflow15(const Char * a, const Char * b, size_t size)
{
    const int order = std::memcmp(a, b, size);
    return order;
}
214
/** Portable equality check: regions are equal iff their sizes match and memcmp reports no difference.
  */
template <typename Char>
inline bool memequalSmallAllowOverflow15(const Char * a, size_t a_size, const Char * b, size_t b_size)
{
    if (a_size != b_size)
        return false;

    return std::memcmp(a, b, a_size) == 0;
}
220
/** Portable variant for sizes that are a multiple of 16: a plain memcmp over 'size' bytes.
  * Only the sign of the result is meaningful.
  */
template <typename Char>
inline int memcmpSmallMultipleOf16(const Char * a, const Char * b, size_t size)
{
    const int order = std::memcmp(a, b, size);
    return order;
}
226
/** Portable variant for regions of exactly 16 bytes.
  * Only the sign of the result is meaningful.
  */
template <typename Char>
inline int memcmp16(const Char * a, const Char * b)
{
    constexpr size_t region_size = 16;
    return std::memcmp(a, b, region_size);
}
232
/** Portable equality check for regions of exactly 16 bytes.
  */
inline bool memequal16(const void * a, const void * b)
{
    constexpr size_t region_size = 16;
    return std::memcmp(a, b, region_size) == 0;
}
237
/** Portable check that a memory region consists of zero bytes only.
  * Examines exactly 'size' bytes starting at 'data'.
  */
inline bool memoryIsZeroSmallAllowOverflow15(const void * data, size_t size)
{
    const char * begin = reinterpret_cast<const char *>(data);
    return std::all_of(begin, begin + size, [](char byte) { return byte == 0; });
}
249
250#endif
251