/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Naive dynamic shuffles.
 *
 * These are written with the assumption that the provided masks are sparsely
 * populated and never contain more than 32 set bits. Other implementations
 * will be faster, and actually correct, if these assumptions don't hold true.
 */

#ifndef LIMEX_SHUFFLE_H
#define LIMEX_SHUFFLE_H

#include "ue2common.h"
#include "util/arch.h"
#include "util/bitutils.h"
#include "util/simd_utils.h"

static really_inline
u32 packedExtract128(m128 s, const m128 permute, const m128 compare) {
    m128 shuffled = pshufb_m128(s, permute);
    m128 compared = and128(shuffled, compare);
    u16 rv = ~movemask128(eq128(compared, shuffled));
    return (u32)rv;
}
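
/*
 * Illustrative scalar model of packedExtract128 (not part of the original
 * header): byte i of 'permute' selects a source byte of 's' (a byte with the
 * high bit set selects zero, as with pshufb), and output bit i is set iff
 * that source byte has any bit set which is *cleared* in byte i of 'compare'.
 * The helper name and the byte-array operands below are hypothetical; this is
 * only a sketch of the same extraction written in scalar code.
 */
static really_inline
u32 packedExtract128_scalar_sketch(const u8 *s, const u8 *permute,
                                   const u8 *compare) {
    u32 rv = 0;
    for (u32 i = 0; i < 16; i++) {
        // Emulate pshufb: a permute byte with the high bit set yields zero.
        u8 src = (permute[i] & 0x80) ? 0 : s[permute[i] & 0xf];
        // Any bit surviving outside the compare mask sets output bit i.
        if (src & (u8)~compare[i]) {
            rv |= 1U << i;
        }
    }
    return rv;
}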

#if defined(HAVE_AVX2)
static really_inline
u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
    // vpshufb doesn't cross lanes, so this is a bit of a cheat
    m256 shuffled = pshufb_m256(s, permute);
    m256 compared = and256(shuffled, compare);
    u32 rv = ~movemask256(eq256(compared, shuffled));
    // stitch the lane-wise results back together
    return (u32)((rv >> 16) | (rv & 0xffffU));
}
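
/*
 * Worked example of the stitch above (illustrative, not from the original
 * header): if, after the inversion, rv has bits 1 and 3 set from the low
 * 128-bit lane and bit 18 (lane bit 2) set from the high lane, then
 * rv == 0x0004000a and (rv >> 16) | (rv & 0xffff) == 0x000e. Each output
 * position is assumed to be produced by at most one lane, so OR-ing the two
 * 16-bit lane results loses nothing.
 */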
#endif // AVX2

#if defined(HAVE_AVX512)
static really_inline
u32 packedExtract512(m512 s, const m512 permute, const m512 compare) {
    // vpshufb doesn't cross lanes, so this is a bit of a cheat
    m512 shuffled = pshufb_m512(s, permute);
    m512 compared = and512(shuffled, compare);
    u64a rv = ~eq512mask(compared, shuffled);
    // stitch the lane-wise results back together
    rv = rv >> 32 | rv;
    return (u32)(((rv >> 16) | rv) & 0xffffU);
}
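
/*
 * Worked example of the two folds above (illustrative, not from the original
 * header): a hit in byte 35 (lane 2, lane bit 3) and a hit in byte 1 (lane 0,
 * lane bit 1) give rv == 0x0000000800000002; after rv = rv >> 32 | rv and
 * ((rv >> 16) | rv) & 0xffff the packed result is 0x000a, i.e. output bits 3
 * and 1. As with the 256-bit case, each output bit is assumed to come from
 * only one of the four 128-bit lanes.
 */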
#endif // AVX512

#endif // LIMEX_SHUFFLE_H