/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef MASKED_MOVE_H
#define MASKED_MOVE_H

#include "arch.h"

#if defined(HAVE_AVX2)

#include "unaligned.h"
#include "simd_utils.h"

#ifdef __cplusplus
extern "C" {
#endif
extern const u32 mm_mask_mask[16];
extern const u32 mm_shuffle_end[32][8];
#ifdef __cplusplus
}
#endif

/* load mask for len bytes from start of buffer */
static really_inline m256
_get_mm_mask_end(u32 len) {
    assert(len <= 32);
    const u8 *masky = (const u8 *)mm_mask_mask;
    m256 mask = load256(masky + 32);
    mask = _mm256_sll_epi32(mask, _mm_cvtsi32_si128(8 - (len >> 2)));
    return mask;
}
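
/*
 * A note on the shift trick above (a sketch of the reasoning; mm_mask_mask
 * itself is defined in the corresponding .c file and is not shown here):
 * the 32 bytes loaded are assumed to be staggered per-dword constants of
 * the form 0xff000000, 0xfe000000, ..., 0x80000000. Shifting every dword
 * left by (8 - (len >> 2)) bits then leaves the sign bit set in exactly
 * the first len >> 2 lanes and clears the rest, which is all that
 * _mm256_maskload_epi32 inspects. For example, len == 12 gives a shift of
 * 5, so only dword lanes 0..2 are loaded from memory.
 */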

/*
 * masked_move256_len: load len bytes from *buf into an m256
 * _______________________________
 * |0<----len---->|            32|
 * -------------------------------
 */
static really_inline m256
masked_move256_len(const u8 *buf, const u32 len) {
    assert(len >= 4);

    m256 lmask = _get_mm_mask_end(len);

    /* Load the trailing (possibly misaligned) 4 bytes of the region and
     * broadcast them to every 64-bit lane. */
    u32 end = unaligned_load_u32(buf + len - 4);
    m256 preshufend = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(end));

    /* Masked load of the leading whole dwords; lanes at index >= len/4 are
     * zeroed and their memory is never touched. */
    m256 v = _mm256_maskload_epi32((const int *)buf, lmask);

    /* Shuffle the broadcast tail bytes into positions len-4..len-1 (as
     * directed by the mm_shuffle_end table) and merge with the dword load. */
    m256 shufend = pshufb_m256(preshufend,
                               loadu256(&mm_shuffle_end[len - 4]));
    m256 target = or256(v, shufend);

    return target;
}
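
/*
 * Usage sketch (hypothetical caller, not part of this header): pull the
 * first len bytes of a short buffer into a vector without reading past the
 * end of that buffer, e.g. when dealing with the tail of a scan:
 *
 *     u8 tail[32];
 *     m256 chunk = masked_move256_len(data, len); // requires 4 <= len <= 32
 *     storeu256(tail, chunk); // tail[0..len-1] now mirror data[0..len-1]
 */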

#endif /* AVX2 */
#endif /* MASKED_MOVE_H */