1/*
2 * Copyright (c) 2016-2017, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#ifndef VALIDATE_SHUFTI_H
30#define VALIDATE_SHUFTI_H
31
32#include "ue2common.h"
33#include "util/simd_utils.h"
34
35#if defined(DEBUG)
36static
37void dumpMask(const void *mask, int len) {
38 const u8 *c = (const u8 *)mask;
39 for (int i = 0; i < len; i++) {
40 printf("%02x", c[i]);
41 }
42 printf("\n");
43}
44#endif
45
46static really_inline
47int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
48 const m256 lo_mask, const m256 and_mask,
49 const u32 neg_mask, const u32 valid_data_mask) {
50 m256 low4bits = set32x8(0xf);
51 m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
52 m256 c_hi = pshufb_m256(hi_mask,
53 rshift64_m256(andnot256(low4bits, data), 4));
54 m256 t = and256(c_lo, c_hi);
55 u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256()));
56#ifdef DEBUG
57 DEBUG_PRINTF("data\n");
58 dumpMask(&data, 32);
59 DEBUG_PRINTF("hi_mask\n");
60 dumpMask(&hi_mask, 32);
61 DEBUG_PRINTF("lo_mask\n");
62 dumpMask(&lo_mask, 32);
63 DEBUG_PRINTF("c_lo\n");
64 dumpMask(&c_lo, 32);
65 DEBUG_PRINTF("c_hi\n");
66 dumpMask(&c_hi, 32);
67 DEBUG_PRINTF("and_mask\n");
68 dumpMask(&and_mask, 32);
69 DEBUG_PRINTF("nresult %x\n", nresult);
70 DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
71#endif
72 u32 cmp_result = (((nresult >> 16) & nresult) ^ neg_mask) & valid_data_mask;
73 return !cmp_result;
74}
75
76static really_inline
77int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
78 const m128 and_mask, const u32 neg_mask,
79 const u32 valid_data_mask) {
80 m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
81 m256 low4bits = set32x8(0xf);
82 m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
83 m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
84 m128 nresult = eq128(and128(t, and_mask), zeroes128());
85#ifdef DEBUG
86 DEBUG_PRINTF("data\n");
87 dumpMask(&data_m256, 32);
88 DEBUG_PRINTF("nib_mask\n");
89 dumpMask(&nib_mask, 32);
90 DEBUG_PRINTF("c_nib\n");
91 dumpMask(&c_nib, 32);
92 DEBUG_PRINTF("nresult\n");
93 dumpMask(&nresult, 16);
94 DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
95#endif
96 u32 cmp_result = (movemask128(nresult) ^ neg_mask) & valid_data_mask;
97 return !cmp_result;
98}
99
100static really_inline
101int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
102 const m256 lo_mask, const m256 and_mask,
103 const u32 neg_mask, const u32 valid_data_mask) {
104 m256 low4bits = set32x8(0xf);
105 m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
106 m256 c_hi = pshufb_m256(hi_mask,
107 rshift64_m256(andnot256(low4bits, data), 4));
108 m256 t = and256(c_lo, c_hi);
109 m256 nresult = eq256(and256(t, and_mask), zeroes256());
110#ifdef DEBUG
111 DEBUG_PRINTF("data\n");
112 dumpMask(&data, 32);
113 DEBUG_PRINTF("hi_mask\n");
114 dumpMask(&hi_mask, 32);
115 DEBUG_PRINTF("lo_mask\n");
116 dumpMask(&lo_mask, 32);
117 DEBUG_PRINTF("c_lo\n");
118 dumpMask(&c_lo, 32);
119 DEBUG_PRINTF("c_hi\n");
120 dumpMask(&c_hi, 32);
121 DEBUG_PRINTF("nresult\n");
122 dumpMask(&nresult, 32);
123 DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
124#endif
125 u32 cmp_result = (movemask256(nresult) ^ neg_mask) & valid_data_mask;
126 return !cmp_result;
127}
128
129static really_inline
130int validateShuftiMask32x16(const m256 data,
131 const m256 hi_mask_1, const m256 hi_mask_2,
132 const m256 lo_mask_1, const m256 lo_mask_2,
133 const m256 bucket_mask_hi,
134 const m256 bucket_mask_lo, const u32 neg_mask,
135 const u32 valid_data_mask) {
136 m256 low4bits = set32x8(0xf);
137 m256 data_lo = and256(data, low4bits);
138 m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
139 m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
140 m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo);
141 m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi);
142 m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi);
143 m256 t1 = and256(c_lo_1, c_hi_1);
144 m256 t2 = and256(c_lo_2, c_hi_2);
145 m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi));
146 u32 nresult = movemask256(eq256(result, zeroes256()));
147#ifdef DEBUG
148 DEBUG_PRINTF("data\n");
149 dumpMask(&data, 32);
150 DEBUG_PRINTF("data_lo\n");
151 dumpMask(&data_lo, 32);
152 DEBUG_PRINTF("data_hi\n");
153 dumpMask(&data_hi, 32);
154 DEBUG_PRINTF("hi_mask_1\n");
155 dumpMask(&hi_mask_1, 16);
156 DEBUG_PRINTF("hi_mask_2\n");
157 dumpMask(&hi_mask_2, 16);
158 DEBUG_PRINTF("lo_mask_1\n");
159 dumpMask(&lo_mask_1, 16);
160 DEBUG_PRINTF("lo_mask_2\n");
161 dumpMask(&lo_mask_2, 16);
162 DEBUG_PRINTF("c_lo_1\n");
163 dumpMask(&c_lo_1, 32);
164 DEBUG_PRINTF("c_lo_2\n");
165 dumpMask(&c_lo_2, 32);
166 DEBUG_PRINTF("c_hi_1\n");
167 dumpMask(&c_hi_1, 32);
168 DEBUG_PRINTF("c_hi_2\n");
169 dumpMask(&c_hi_2, 32);
170 DEBUG_PRINTF("result\n");
171 dumpMask(&result, 32);
172 DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask);
173#endif
174 u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask;
175 return !cmp_result;
176}
177
178static really_inline
179int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) {
180 u32 t = ~(data | hi_bits);
181 t += lo_bits;
182 t &= (~data) & hi_bits;
183 DEBUG_PRINTF("t %x\n", t);
184 return !!t;
185}
186
187static really_inline
188int checkMultipath64(u64a data, u64a hi_bits, u64a lo_bits) {
189 u64a t = ~(data | hi_bits);
190 t += lo_bits;
191 t &= (~data) & hi_bits;
192 DEBUG_PRINTF("t %llx\n", t);
193 return !!t;
194}
195
196static really_inline
197int validateMultipathShuftiMask16x8(const m128 data,
198 const m256 nib_mask,
199 const m128 bucket_select_mask,
200 const u32 hi_bits, const u32 lo_bits,
201 const u32 neg_mask,
202 const u32 valid_path_mask) {
203 m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
204 m256 low4bits = set32x8(0xf);
205 m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
206 m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
207 m128 result = and128(t, bucket_select_mask);
208 u32 nresult = movemask128(eq128(result, zeroes128()));
209 u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
210
211 DEBUG_PRINTF("cmp_result %x\n", cmp_result);
212
213 return checkMultipath32(cmp_result, hi_bits, lo_bits);
214}
215
216static really_inline
217int validateMultipathShuftiMask32x8(const m256 data,
218 const m256 hi_mask, const m256 lo_mask,
219 const m256 bucket_select_mask,
220 const u32 hi_bits, const u32 lo_bits,
221 const u32 neg_mask,
222 const u32 valid_path_mask) {
223 m256 low4bits = set32x8(0xf);
224 m256 data_lo = and256(data, low4bits);
225 m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
226 m256 c_lo = pshufb_m256(lo_mask, data_lo);
227 m256 c_hi = pshufb_m256(hi_mask, data_hi);
228 m256 c = and256(c_lo, c_hi);
229 m256 result = and256(c, bucket_select_mask);
230 u32 nresult = movemask256(eq256(result, zeroes256()));
231 u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
232
233 DEBUG_PRINTF("cmp_result %x\n", cmp_result);
234
235 return checkMultipath32(cmp_result, hi_bits, lo_bits);
236}
237
238static really_inline
239int validateMultipathShuftiMask32x16(const m256 data,
240 const m256 hi_mask_1, const m256 hi_mask_2,
241 const m256 lo_mask_1, const m256 lo_mask_2,
242 const m256 bucket_select_mask_hi,
243 const m256 bucket_select_mask_lo,
244 const u32 hi_bits, const u32 lo_bits,
245 const u32 neg_mask,
246 const u32 valid_path_mask) {
247 m256 low4bits = set32x8(0xf);
248 m256 data_lo = and256(data, low4bits);
249 m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
250 m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
251 m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo);
252 m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi);
253 m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi);
254 m256 t1 = and256(c_lo_1, c_hi_1);
255 m256 t2 = and256(c_lo_2, c_hi_2);
256 m256 result = or256(and256(t1, bucket_select_mask_lo),
257 and256(t2, bucket_select_mask_hi));
258 u32 nresult = movemask256(eq256(result, zeroes256()));
259 u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
260
261 DEBUG_PRINTF("cmp_result %x\n", cmp_result);
262
263 return checkMultipath32(cmp_result, hi_bits, lo_bits);
264}
265
266static really_inline
267int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
268 const m256 hi_mask, const m256 lo_mask,
269 const m256 bucket_select_mask_1,
270 const m256 bucket_select_mask_2,
271 const u64a hi_bits, const u64a lo_bits,
272 const u64a neg_mask,
273 const u64a valid_path_mask) {
274 m256 low4bits = set32x8(0xf);
275 m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
276 m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
277 m256 c_hi_1 = pshufb_m256(hi_mask,
278 rshift64_m256(andnot256(low4bits, data_1), 4));
279 m256 c_hi_2 = pshufb_m256(hi_mask,
280 rshift64_m256(andnot256(low4bits, data_2), 4));
281 m256 t1 = and256(c_lo_1, c_hi_1);
282 m256 t2 = and256(c_lo_2, c_hi_2);
283 m256 nresult_1 = eq256(and256(t1, bucket_select_mask_1), zeroes256());
284 m256 nresult_2 = eq256(and256(t2, bucket_select_mask_2), zeroes256());
285 u64a nresult = (u64a)movemask256(nresult_1) |
286 (u64a)movemask256(nresult_2) << 32;
287 u64a cmp_result = (nresult ^ neg_mask) | valid_path_mask;
288
289 DEBUG_PRINTF("cmp_result %llx\n", cmp_result);
290
291 return checkMultipath64(cmp_result, hi_bits, lo_bits);
292}
293
294#endif
295