1#include <assert.h>
2#include <stdbool.h>
3#include <stdint.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <string.h>
7
8#include <roaring/array_util.h>
9#include <roaring/portability.h>
10#include <roaring/utilasm.h>
11extern inline int32_t binarySearch(const uint16_t *array, int32_t lenarray,
12 uint16_t ikey);
13
14#ifdef USESSE4
15// used by intersect_vector16
16ALIGNED(0x1000)
17static const uint8_t shuffle_mask16[] = {
18 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
19 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
20 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 0xFF, 0xFF,
21 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
22 0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
23 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
24 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
25 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
26 2, 3, 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
27 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 0xFF, 0xFF,
28 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 0xFF, 0xFF,
29 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
30 0, 1, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
31 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF,
32 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
33 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
34 4, 5, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
35 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 0xFF, 0xFF,
36 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
37 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
38 0, 1, 2, 3, 4, 5, 6, 7, 0xFF, 0xFF, 0xFF, 0xFF,
39 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
40 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9,
41 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
42 2, 3, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
43 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 8, 9, 0xFF, 0xFF,
44 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9,
45 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
46 0, 1, 4, 5, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
47 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 8, 9, 0xFF, 0xFF,
48 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
49 4, 5, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
50 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
51 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 8, 9, 0xFF, 0xFF,
52 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7,
53 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
54 0, 1, 2, 3, 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF,
55 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 8, 9, 0xFF, 0xFF,
56 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
57 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
58 2, 3, 4, 5, 6, 7, 8, 9, 0xFF, 0xFF, 0xFF, 0xFF,
59 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7,
60 8, 9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 10, 11, 0xFF, 0xFF,
61 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62 0, 1, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
63 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
64 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
65 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66 4, 5, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
67 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 10, 11, 0xFF, 0xFF,
68 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
69 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
70 0, 1, 2, 3, 4, 5, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
71 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
72 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7,
73 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
74 2, 3, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
75 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 10, 11,
76 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7,
77 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
78 0, 1, 4, 5, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
79 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 10, 11,
80 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
81 4, 5, 6, 7, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
82 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
83 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, 10, 11, 0xFF, 0xFF,
84 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9,
85 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
86 0, 1, 2, 3, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
87 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, 10, 11, 0xFF, 0xFF,
88 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
89 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
90 2, 3, 4, 5, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
91 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 8, 9,
92 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9,
93 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
94 0, 1, 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
95 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 8, 9, 10, 11,
96 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
97 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
98 4, 5, 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF,
99 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 8, 9,
100 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
101 6, 7, 8, 9, 10, 11, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
102 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
103 0xFF, 0xFF, 0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
104 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 12, 13,
105 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
106 2, 3, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
107 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 12, 13, 0xFF, 0xFF,
108 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 12, 13,
109 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
110 0, 1, 4, 5, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
111 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 12, 13, 0xFF, 0xFF,
112 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
113 4, 5, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
114 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
115 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 12, 13, 0xFF, 0xFF,
116 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7,
117 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
118 0, 1, 2, 3, 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
119 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 12, 13, 0xFF, 0xFF,
120 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
121 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
122 2, 3, 4, 5, 6, 7, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
123 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7,
124 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 12, 13,
125 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
126 0, 1, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
127 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, 12, 13, 0xFF, 0xFF,
128 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
129 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
130 4, 5, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
131 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 8, 9, 12, 13,
132 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
133 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
134 0, 1, 2, 3, 4, 5, 8, 9, 12, 13, 0xFF, 0xFF,
135 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF,
136 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7,
137 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
138 2, 3, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
139 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 8, 9,
140 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7,
141 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
142 0, 1, 4, 5, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF,
143 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 8, 9,
144 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
145 4, 5, 6, 7, 8, 9, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
146 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
147 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 10, 11, 12, 13, 0xFF, 0xFF,
148 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 10, 11,
149 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
150 0, 1, 2, 3, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
151 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 10, 11, 12, 13, 0xFF, 0xFF,
152 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
153 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
154 2, 3, 4, 5, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
155 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 10, 11,
156 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 10, 11,
157 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
158 0, 1, 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
159 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 10, 11, 12, 13,
160 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
161 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
162 4, 5, 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
163 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 10, 11,
164 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
165 6, 7, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
166 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13,
167 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF,
168 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9,
169 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
170 2, 3, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
171 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 8, 9, 10, 11,
172 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9,
173 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
174 0, 1, 4, 5, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF,
175 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 8, 9, 10, 11,
176 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
177 4, 5, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
178 6, 7, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
179 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 8, 9, 10, 11,
180 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7,
181 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
182 0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13,
183 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 8, 9, 10, 11,
184 12, 13, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
185 6, 7, 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 0xFF, 0xFF,
186 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
187 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7,
188 8, 9, 10, 11, 12, 13, 0xFF, 0xFF, 14, 15, 0xFF, 0xFF,
189 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
190 0, 1, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
191 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
192 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
193 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
194 4, 5, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
195 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 14, 15, 0xFF, 0xFF,
196 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
197 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
198 0, 1, 2, 3, 4, 5, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
199 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
200 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7,
201 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
202 2, 3, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
203 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 14, 15,
204 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7,
205 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
206 0, 1, 4, 5, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
207 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 14, 15,
208 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
209 4, 5, 6, 7, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
210 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
211 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, 14, 15, 0xFF, 0xFF,
212 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9,
213 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
214 0, 1, 2, 3, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
215 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, 14, 15, 0xFF, 0xFF,
216 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
217 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
218 2, 3, 4, 5, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
219 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 8, 9,
220 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9,
221 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
222 0, 1, 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
223 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 8, 9, 14, 15,
224 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
225 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
226 4, 5, 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
227 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 8, 9,
228 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
229 6, 7, 8, 9, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
230 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15,
231 0xFF, 0xFF, 0xFF, 0xFF, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
232 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 10, 11,
233 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
234 2, 3, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
235 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 10, 11, 14, 15,
236 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 10, 11,
237 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
238 0, 1, 4, 5, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
239 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 10, 11, 14, 15,
240 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
241 4, 5, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
242 6, 7, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
243 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 10, 11, 14, 15,
244 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7,
245 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
246 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 0xFF, 0xFF,
247 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 10, 11, 14, 15,
248 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
249 6, 7, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
250 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, 0xFF, 0xFF,
251 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7,
252 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 10, 11,
253 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
254 0, 1, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
255 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9, 10, 11, 14, 15,
256 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
257 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
258 4, 5, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
259 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 8, 9, 10, 11,
260 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
261 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
262 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 14, 15,
263 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 8, 9, 10, 11, 14, 15,
264 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7,
265 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
266 2, 3, 6, 7, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF,
267 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 8, 9,
268 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7,
269 8, 9, 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
270 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15,
271 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 8, 9,
272 10, 11, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
273 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 0xFF, 0xFF,
274 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
275 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 12, 13, 14, 15, 0xFF, 0xFF,
276 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 12, 13,
277 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
278 0, 1, 2, 3, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
279 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 12, 13, 14, 15, 0xFF, 0xFF,
280 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
281 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
282 2, 3, 4, 5, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
283 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 12, 13,
284 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 12, 13,
285 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
286 0, 1, 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
287 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 12, 13, 14, 15,
288 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
289 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
290 4, 5, 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
291 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 12, 13,
292 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
293 6, 7, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
294 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
295 0xFF, 0xFF, 0xFF, 0xFF, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF,
296 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9,
297 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
298 2, 3, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
299 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 8, 9, 12, 13,
300 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9,
301 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
302 0, 1, 4, 5, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF,
303 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 8, 9, 12, 13,
304 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
305 4, 5, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
306 6, 7, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
307 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7, 8, 9, 12, 13,
308 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7,
309 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
310 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 15,
311 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7, 8, 9, 12, 13,
312 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
313 6, 7, 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
314 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15,
315 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 6, 7,
316 8, 9, 12, 13, 14, 15, 0xFF, 0xFF, 10, 11, 12, 13,
317 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
318 0, 1, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
319 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 10, 11, 12, 13, 14, 15,
320 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
321 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
322 4, 5, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
323 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 10, 11, 12, 13,
324 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5,
325 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
326 0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15,
327 0xFF, 0xFF, 0xFF, 0xFF, 6, 7, 10, 11, 12, 13, 14, 15,
328 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 6, 7,
329 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
330 2, 3, 6, 7, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF,
331 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 6, 7, 10, 11,
332 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 6, 7,
333 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
334 0, 1, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15,
335 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 4, 5, 6, 7, 10, 11,
336 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
337 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF,
338 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
339 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 8, 9, 10, 11, 12, 13,
340 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 8, 9,
341 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
342 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
343 0xFF, 0xFF, 0xFF, 0xFF, 4, 5, 8, 9, 10, 11, 12, 13,
344 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5,
345 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF,
346 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15,
347 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3, 4, 5, 8, 9,
348 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 6, 7, 8, 9,
349 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
350 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
351 0xFF, 0xFF, 0xFF, 0xFF, 2, 3, 6, 7, 8, 9, 10, 11,
352 12, 13, 14, 15, 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
353 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF,
354 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
355 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 4, 5, 6, 7, 8, 9,
356 10, 11, 12, 13, 14, 15, 0xFF, 0xFF, 2, 3, 4, 5,
357 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0xFF, 0xFF,
358 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
359 12, 13, 14, 15};
360
361/**
362 * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
363 * Optimized by D. Lemire on May 3rd 2013
364 */
365int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
366 const uint16_t *__restrict__ B, size_t s_b,
367 uint16_t *C) {
368 size_t count = 0;
369 size_t i_a = 0, i_b = 0;
370 const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
371 const size_t st_a = (s_a / vectorlength) * vectorlength;
372 const size_t st_b = (s_b / vectorlength) * vectorlength;
373 __m128i v_a, v_b;
374 if ((i_a < st_a) && (i_b < st_b)) {
375 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
376 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
377 while ((A[i_a] == 0) || (B[i_b] == 0)) {
378 const __m128i res_v = _mm_cmpestrm(
379 v_b, vectorlength, v_a, vectorlength,
380 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
381 const int r = _mm_extract_epi32(res_v, 0);
382 __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 + r);
383 __m128i p = _mm_shuffle_epi8(v_a, sm16);
384 _mm_storeu_si128((__m128i *)&C[count], p); // can overflow
385 count += _mm_popcnt_u32(r);
386 const uint16_t a_max = A[i_a + vectorlength - 1];
387 const uint16_t b_max = B[i_b + vectorlength - 1];
388 if (a_max <= b_max) {
389 i_a += vectorlength;
390 if (i_a == st_a) break;
391 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
392 }
393 if (b_max <= a_max) {
394 i_b += vectorlength;
395 if (i_b == st_b) break;
396 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
397 }
398 }
399 if ((i_a < st_a) && (i_b < st_b))
400 while (true) {
401 const __m128i res_v = _mm_cmpistrm(
402 v_b, v_a,
403 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
404 const int r = _mm_extract_epi32(res_v, 0);
405 __m128i sm16 =
406 _mm_load_si128((const __m128i *)shuffle_mask16 + r);
407 __m128i p = _mm_shuffle_epi8(v_a, sm16);
408 _mm_storeu_si128((__m128i *)&C[count], p); // can overflow
409 count += _mm_popcnt_u32(r);
410 const uint16_t a_max = A[i_a + vectorlength - 1];
411 const uint16_t b_max = B[i_b + vectorlength - 1];
412 if (a_max <= b_max) {
413 i_a += vectorlength;
414 if (i_a == st_a) break;
415 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
416 }
417 if (b_max <= a_max) {
418 i_b += vectorlength;
419 if (i_b == st_b) break;
420 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
421 }
422 }
423 }
424 // intersect the tail using scalar intersection
425 while (i_a < s_a && i_b < s_b) {
426 uint16_t a = A[i_a];
427 uint16_t b = B[i_b];
428 if (a < b) {
429 i_a++;
430 } else if (b < a) {
431 i_b++;
432 } else {
433 C[count] = a; //==b;
434 count++;
435 i_a++;
436 i_b++;
437 }
438 }
439 return (int32_t)count;
440}
441
442int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
443 size_t s_a,
444 const uint16_t *__restrict__ B,
445 size_t s_b) {
446 size_t count = 0;
447 size_t i_a = 0, i_b = 0;
448 const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
449 const size_t st_a = (s_a / vectorlength) * vectorlength;
450 const size_t st_b = (s_b / vectorlength) * vectorlength;
451 __m128i v_a, v_b;
452 if ((i_a < st_a) && (i_b < st_b)) {
453 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
454 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
455 while ((A[i_a] == 0) || (B[i_b] == 0)) {
456 const __m128i res_v = _mm_cmpestrm(
457 v_b, vectorlength, v_a, vectorlength,
458 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
459 const int r = _mm_extract_epi32(res_v, 0);
460 count += _mm_popcnt_u32(r);
461 const uint16_t a_max = A[i_a + vectorlength - 1];
462 const uint16_t b_max = B[i_b + vectorlength - 1];
463 if (a_max <= b_max) {
464 i_a += vectorlength;
465 if (i_a == st_a) break;
466 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
467 }
468 if (b_max <= a_max) {
469 i_b += vectorlength;
470 if (i_b == st_b) break;
471 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
472 }
473 }
474 if ((i_a < st_a) && (i_b < st_b))
475 while (true) {
476 const __m128i res_v = _mm_cmpistrm(
477 v_b, v_a,
478 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
479 const int r = _mm_extract_epi32(res_v, 0);
480 count += _mm_popcnt_u32(r);
481 const uint16_t a_max = A[i_a + vectorlength - 1];
482 const uint16_t b_max = B[i_b + vectorlength - 1];
483 if (a_max <= b_max) {
484 i_a += vectorlength;
485 if (i_a == st_a) break;
486 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
487 }
488 if (b_max <= a_max) {
489 i_b += vectorlength;
490 if (i_b == st_b) break;
491 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
492 }
493 }
494 }
495 // intersect the tail using scalar intersection
496 while (i_a < s_a && i_b < s_b) {
497 uint16_t a = A[i_a];
498 uint16_t b = B[i_b];
499 if (a < b) {
500 i_a++;
501 } else if (b < a) {
502 i_b++;
503 } else {
504 count++;
505 i_a++;
506 i_b++;
507 }
508 }
509 return (int32_t)count;
510}
511
512/////////
513// Warning:
514// This function may not be safe if A == C or B == C.
515/////////
516int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
517 const uint16_t *__restrict__ B, size_t s_b,
518 uint16_t *C) {
519 // we handle the degenerate case
520 if (s_a == 0) return 0;
521 if (s_b == 0) {
522 if (A != C) memcpy(C, A, sizeof(uint16_t) * s_a);
523 return (int32_t)s_a;
524 }
525 // handle the leading zeroes, it is messy but it allows us to use the fast
526 // _mm_cmpistrm instrinsic safely
527 int32_t count = 0;
528 if ((A[0] == 0) || (B[0] == 0)) {
529 if ((A[0] == 0) && (B[0] == 0)) {
530 A++;
531 s_a--;
532 B++;
533 s_b--;
534 } else if (A[0] == 0) {
535 C[count++] = 0;
536 A++;
537 s_a--;
538 } else {
539 B++;
540 s_b--;
541 }
542 }
543 // at this point, we have two non-empty arrays, made of non-zero
544 // increasing values.
545 size_t i_a = 0, i_b = 0;
546 const size_t vectorlength = sizeof(__m128i) / sizeof(uint16_t);
547 const size_t st_a = (s_a / vectorlength) * vectorlength;
548 const size_t st_b = (s_b / vectorlength) * vectorlength;
549 if ((i_a < st_a) && (i_b < st_b)) { // this is the vectorized code path
550 __m128i v_a, v_b; //, v_bmax;
551 // we load a vector from A and a vector from B
552 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
553 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
554 // we have a runningmask which indicates which values from A have been
555 // spotted in B, these don't get written out.
556 __m128i runningmask_a_found_in_b = _mm_setzero_si128();
557 /****
558 * start of the main vectorized loop
559 *****/
560 while (true) {
561 // afoundinb will contain a mask indicate for each entry in A
562 // whether it is seen
563 // in B
564 const __m128i a_found_in_b =
565 _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
566 _SIDD_BIT_MASK);
567 runningmask_a_found_in_b =
568 _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
569 // we always compare the last values of A and B
570 const uint16_t a_max = A[i_a + vectorlength - 1];
571 const uint16_t b_max = B[i_b + vectorlength - 1];
572 if (a_max <= b_max) {
573 // Ok. In this code path, we are ready to write our v_a
574 // because there is no need to read more from B, they will
575 // all be large values.
576 const int bitmask_belongs_to_difference =
577 _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
578 /*** next few lines are probably expensive *****/
579 __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
580 bitmask_belongs_to_difference);
581 __m128i p = _mm_shuffle_epi8(v_a, sm16);
582 _mm_storeu_si128((__m128i *)&C[count], p); // can overflow
583 count += _mm_popcnt_u32(bitmask_belongs_to_difference);
584 // we advance a
585 i_a += vectorlength;
586 if (i_a == st_a) // no more
587 break;
588 runningmask_a_found_in_b = _mm_setzero_si128();
589 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
590 }
591 if (b_max <= a_max) {
592 // in this code path, the current v_b has become useless
593 i_b += vectorlength;
594 if (i_b == st_b) break;
595 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
596 }
597 }
598 // at this point, either we have i_a == st_a, which is the end of the
599 // vectorized processing,
600 // or we have i_b == st_b, and we are not done processing the vector...
601 // so we need to finish it off.
602 if (i_a < st_a) { // we have unfinished business...
603 uint16_t buffer[8]; // buffer to do a masked load
604 memset(buffer, 0, 8 * sizeof(uint16_t));
605 memcpy(buffer, B + i_b, (s_b - i_b) * sizeof(uint16_t));
606 v_b = _mm_lddqu_si128((__m128i *)buffer);
607 const __m128i a_found_in_b =
608 _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
609 _SIDD_BIT_MASK);
610 runningmask_a_found_in_b =
611 _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
612 const int bitmask_belongs_to_difference =
613 _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
614 __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
615 bitmask_belongs_to_difference);
616 __m128i p = _mm_shuffle_epi8(v_a, sm16);
617 _mm_storeu_si128((__m128i *)&C[count], p); // can overflow
618 count += _mm_popcnt_u32(bitmask_belongs_to_difference);
619 i_a += vectorlength;
620 }
621 // at this point we should have i_a == st_a and i_b == st_b
622 }
623 // do the tail using scalar code
624 while (i_a < s_a && i_b < s_b) {
625 uint16_t a = A[i_a];
626 uint16_t b = B[i_b];
627 if (b < a) {
628 i_b++;
629 } else if (a < b) {
630 C[count] = a;
631 count++;
632 i_a++;
633 } else { //==
634 i_a++;
635 i_b++;
636 }
637 }
638 if (i_a < s_a) {
639 if(C == A) {
640 assert(count <= i_a);
641 if(count < i_a) {
642 memmove(C + count, A + i_a, sizeof(uint16_t) * (s_a - i_a));
643 }
644 } else {
645 for(size_t i = 0; i < (s_a - i_a); i++) {
646 C[count + i] = A[i + i_a];
647 }
648 }
649 count += (int32_t)(s_a - i_a);
650 }
651 return count;
652}
653
654#endif // USESSE4
655
656
657
658#ifdef USE_OLD_SKEW_INTERSECT
659// TODO: given enough experience with the new skew intersect, drop the old one from the code base.
660
661
662/* Computes the intersection between one small and one large set of uint16_t.
663 * Stores the result into buffer and return the number of elements. */
664int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
665 const uint16_t *large, size_t size_l,
666 uint16_t *buffer) {
667 size_t pos = 0, idx_l = 0, idx_s = 0;
668
669 if (0 == size_s) {
670 return 0;
671 }
672
673 uint16_t val_l = large[idx_l], val_s = small[idx_s];
674
675 while (true) {
676 if (val_l < val_s) {
677 idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
678 if (idx_l == size_l) break;
679 val_l = large[idx_l];
680 } else if (val_s < val_l) {
681 idx_s++;
682 if (idx_s == size_s) break;
683 val_s = small[idx_s];
684 } else {
685 buffer[pos++] = val_s;
686 idx_s++;
687 if (idx_s == size_s) break;
688 val_s = small[idx_s];
689 idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
690 if (idx_l == size_l) break;
691 val_l = large[idx_l];
692 }
693 }
694
695 return (int32_t)pos;
696}
697#else // USE_OLD_SKEW_INTERSECT
698
699
700/**
701* Branchless binary search going after 4 values at once.
702* Assumes that array is sorted.
703* You have that array[*index1] >= target1, array[*index12] >= target2, ...
704* except when *index1 = n, in which case you know that all values in array are
705* smaller than target1, and so forth.
706* It has logarithmic complexity.
707*/
708static void binarySearch4(const uint16_t *array, int32_t n, uint16_t target1,
709 uint16_t target2, uint16_t target3, uint16_t target4,
710 int32_t *index1, int32_t *index2, int32_t *index3,
711 int32_t *index4) {
712 const uint16_t *base1 = array;
713 const uint16_t *base2 = array;
714 const uint16_t *base3 = array;
715 const uint16_t *base4 = array;
716 if (n == 0)
717 return;
718 while (n > 1) {
719 int32_t half = n >> 1;
720 base1 = (base1[half] < target1) ? &base1[half] : base1;
721 base2 = (base2[half] < target2) ? &base2[half] : base2;
722 base3 = (base3[half] < target3) ? &base3[half] : base3;
723 base4 = (base4[half] < target4) ? &base4[half] : base4;
724 n -= half;
725 }
726 *index1 = (int32_t)((*base1 < target1) + base1 - array);
727 *index2 = (int32_t)((*base2 < target2) + base2 - array);
728 *index3 = (int32_t)((*base3 < target3) + base3 - array);
729 *index4 = (int32_t)((*base4 < target4) + base4 - array);
730}
731
732/**
733* Branchless binary search going after 2 values at once.
734* Assumes that array is sorted.
735* You have that array[*index1] >= target1, array[*index12] >= target2.
736* except when *index1 = n, in which case you know that all values in array are
737* smaller than target1, and so forth.
738* It has logarithmic complexity.
739*/
740static void binarySearch2(const uint16_t *array, int32_t n, uint16_t target1,
741 uint16_t target2, int32_t *index1, int32_t *index2) {
742 const uint16_t *base1 = array;
743 const uint16_t *base2 = array;
744 if (n == 0)
745 return;
746 while (n > 1) {
747 int32_t half = n >> 1;
748 base1 = (base1[half] < target1) ? &base1[half] : base1;
749 base2 = (base2[half] < target2) ? &base2[half] : base2;
750 n -= half;
751 }
752 *index1 = (int32_t)((*base1 < target1) + base1 - array);
753 *index2 = (int32_t)((*base2 < target2) + base2 - array);
754}
755
756/* Computes the intersection between one small and one large set of uint16_t.
757 * Stores the result into buffer and return the number of elements.
758 * Processes the small set in blocks of 4 values calling binarySearch4
759 * and binarySearch2. This approach can be slightly superior to a conventional
760 * galloping search in some instances.
761 */
762int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
763 const uint16_t *large, size_t size_l,
764 uint16_t *buffer) {
765 size_t pos = 0, idx_l = 0, idx_s = 0;
766
767 if (0 == size_s) {
768 return 0;
769 }
770 int32_t index1 = 0, index2 = 0, index3 = 0, index4 = 0;
771 while ((idx_s + 4 <= size_s) && (idx_l < size_l)) {
772 uint16_t target1 = small[idx_s];
773 uint16_t target2 = small[idx_s + 1];
774 uint16_t target3 = small[idx_s + 2];
775 uint16_t target4 = small[idx_s + 3];
776 binarySearch4(large + idx_l, (int32_t)(size_l - idx_l), target1, target2, target3,
777 target4, &index1, &index2, &index3, &index4);
778 if ((index1 + idx_l < size_l) && (large[idx_l + index1] == target1)) {
779 buffer[pos++] = target1;
780 }
781 if ((index2 + idx_l < size_l) && (large[idx_l + index2] == target2)) {
782 buffer[pos++] = target2;
783 }
784 if ((index3 + idx_l < size_l) && (large[idx_l + index3] == target3)) {
785 buffer[pos++] = target3;
786 }
787 if ((index4 + idx_l < size_l) && (large[idx_l + index4] == target4)) {
788 buffer[pos++] = target4;
789 }
790 idx_s += 4;
791 idx_l += index4;
792 }
793 if ((idx_s + 2 <= size_s) && (idx_l < size_l)) {
794 uint16_t target1 = small[idx_s];
795 uint16_t target2 = small[idx_s + 1];
796 binarySearch2(large + idx_l, (int32_t)(size_l - idx_l), target1, target2, &index1,
797 &index2);
798 if ((index1 + idx_l < size_l) && (large[idx_l + index1] == target1)) {
799 buffer[pos++] = target1;
800 }
801 if ((index2 + idx_l < size_l) && (large[idx_l + index2] == target2)) {
802 buffer[pos++] = target2;
803 }
804 idx_s += 2;
805 idx_l += index2;
806 }
807 if ((idx_s < size_s) && (idx_l < size_l)) {
808 uint16_t val_s = small[idx_s];
809 int32_t index = binarySearch(large + idx_l, (int32_t)(size_l - idx_l), val_s);
810 if (index >= 0)
811 buffer[pos++] = val_s;
812 }
813 return (int32_t)pos;
814}
815
816
817#endif //USE_OLD_SKEW_INTERSECT
818
819
820// TODO: this could be accelerated, possibly, by using binarySearch4 as above.
821int32_t intersect_skewed_uint16_cardinality(const uint16_t *small,
822 size_t size_s,
823 const uint16_t *large,
824 size_t size_l) {
825 size_t pos = 0, idx_l = 0, idx_s = 0;
826
827 if (0 == size_s) {
828 return 0;
829 }
830
831 uint16_t val_l = large[idx_l], val_s = small[idx_s];
832
833 while (true) {
834 if (val_l < val_s) {
835 idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
836 if (idx_l == size_l) break;
837 val_l = large[idx_l];
838 } else if (val_s < val_l) {
839 idx_s++;
840 if (idx_s == size_s) break;
841 val_s = small[idx_s];
842 } else {
843 pos++;
844 idx_s++;
845 if (idx_s == size_s) break;
846 val_s = small[idx_s];
847 idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
848 if (idx_l == size_l) break;
849 val_l = large[idx_l];
850 }
851 }
852
853 return (int32_t)pos;
854}
855
856bool intersect_skewed_uint16_nonempty(const uint16_t *small, size_t size_s,
857 const uint16_t *large, size_t size_l) {
858 size_t idx_l = 0, idx_s = 0;
859
860 if (0 == size_s) {
861 return false;
862 }
863
864 uint16_t val_l = large[idx_l], val_s = small[idx_s];
865
866 while (true) {
867 if (val_l < val_s) {
868 idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
869 if (idx_l == size_l) break;
870 val_l = large[idx_l];
871 } else if (val_s < val_l) {
872 idx_s++;
873 if (idx_s == size_s) break;
874 val_s = small[idx_s];
875 } else {
876 return true;
877 }
878 }
879
880 return false;
881}
882
883/**
884 * Generic intersection function.
885 */
886int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
887 const uint16_t *B, const size_t lenB, uint16_t *out) {
888 const uint16_t *initout = out;
889 if (lenA == 0 || lenB == 0) return 0;
890 const uint16_t *endA = A + lenA;
891 const uint16_t *endB = B + lenB;
892
893 while (1) {
894 while (*A < *B) {
895 SKIP_FIRST_COMPARE:
896 if (++A == endA) return (int32_t)(out - initout);
897 }
898 while (*A > *B) {
899 if (++B == endB) return (int32_t)(out - initout);
900 }
901 if (*A == *B) {
902 *out++ = *A;
903 if (++A == endA || ++B == endB) return (int32_t)(out - initout);
904 } else {
905 goto SKIP_FIRST_COMPARE;
906 }
907 }
908 return (int32_t)(out - initout); // NOTREACHED
909}
910
911int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
912 const uint16_t *B, const size_t lenB) {
913 int32_t answer = 0;
914 if (lenA == 0 || lenB == 0) return 0;
915 const uint16_t *endA = A + lenA;
916 const uint16_t *endB = B + lenB;
917
918 while (1) {
919 while (*A < *B) {
920 SKIP_FIRST_COMPARE:
921 if (++A == endA) return answer;
922 }
923 while (*A > *B) {
924 if (++B == endB) return answer;
925 }
926 if (*A == *B) {
927 ++answer;
928 if (++A == endA || ++B == endB) return answer;
929 } else {
930 goto SKIP_FIRST_COMPARE;
931 }
932 }
933 return answer; // NOTREACHED
934}
935
936
937bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
938 const uint16_t *B, const size_t lenB) {
939 if (lenA == 0 || lenB == 0) return 0;
940 const uint16_t *endA = A + lenA;
941 const uint16_t *endB = B + lenB;
942
943 while (1) {
944 while (*A < *B) {
945 SKIP_FIRST_COMPARE:
946 if (++A == endA) return false;
947 }
948 while (*A > *B) {
949 if (++B == endB) return false;
950 }
951 if (*A == *B) {
952 return true;
953 } else {
954 goto SKIP_FIRST_COMPARE;
955 }
956 }
957 return false; // NOTREACHED
958}
959
960
961
962/**
963 * Generic intersection function.
964 */
965size_t intersection_uint32(const uint32_t *A, const size_t lenA,
966 const uint32_t *B, const size_t lenB,
967 uint32_t *out) {
968 const uint32_t *initout = out;
969 if (lenA == 0 || lenB == 0) return 0;
970 const uint32_t *endA = A + lenA;
971 const uint32_t *endB = B + lenB;
972
973 while (1) {
974 while (*A < *B) {
975 SKIP_FIRST_COMPARE:
976 if (++A == endA) return (out - initout);
977 }
978 while (*A > *B) {
979 if (++B == endB) return (out - initout);
980 }
981 if (*A == *B) {
982 *out++ = *A;
983 if (++A == endA || ++B == endB) return (out - initout);
984 } else {
985 goto SKIP_FIRST_COMPARE;
986 }
987 }
988 return (out - initout); // NOTREACHED
989}
990
991size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
992 const uint32_t *B, const size_t lenB) {
993 if (lenA == 0 || lenB == 0) return 0;
994 size_t card = 0;
995 const uint32_t *endA = A + lenA;
996 const uint32_t *endB = B + lenB;
997
998 while (1) {
999 while (*A < *B) {
1000 SKIP_FIRST_COMPARE:
1001 if (++A == endA) return card;
1002 }
1003 while (*A > *B) {
1004 if (++B == endB) return card;
1005 }
1006 if (*A == *B) {
1007 card++;
1008 if (++A == endA || ++B == endB) return card;
1009 } else {
1010 goto SKIP_FIRST_COMPARE;
1011 }
1012 }
1013 return card; // NOTREACHED
1014}
1015
1016// can one vectorize the computation of the union? (Update: Yes! See
1017// union_vector16).
1018
1019size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
1020 size_t size_2, uint16_t *buffer) {
1021 size_t pos = 0, idx_1 = 0, idx_2 = 0;
1022
1023 if (0 == size_2) {
1024 memmove(buffer, set_1, size_1 * sizeof(uint16_t));
1025 return size_1;
1026 }
1027 if (0 == size_1) {
1028 memmove(buffer, set_2, size_2 * sizeof(uint16_t));
1029 return size_2;
1030 }
1031
1032 uint16_t val_1 = set_1[idx_1], val_2 = set_2[idx_2];
1033
1034 while (true) {
1035 if (val_1 < val_2) {
1036 buffer[pos++] = val_1;
1037 ++idx_1;
1038 if (idx_1 >= size_1) break;
1039 val_1 = set_1[idx_1];
1040 } else if (val_2 < val_1) {
1041 buffer[pos++] = val_2;
1042 ++idx_2;
1043 if (idx_2 >= size_2) break;
1044 val_2 = set_2[idx_2];
1045 } else {
1046 buffer[pos++] = val_1;
1047 ++idx_1;
1048 ++idx_2;
1049 if (idx_1 >= size_1 || idx_2 >= size_2) break;
1050 val_1 = set_1[idx_1];
1051 val_2 = set_2[idx_2];
1052 }
1053 }
1054
1055 if (idx_1 < size_1) {
1056 const size_t n_elems = size_1 - idx_1;
1057 memmove(buffer + pos, set_1 + idx_1, n_elems * sizeof(uint16_t));
1058 pos += n_elems;
1059 } else if (idx_2 < size_2) {
1060 const size_t n_elems = size_2 - idx_2;
1061 memmove(buffer + pos, set_2 + idx_2, n_elems * sizeof(uint16_t));
1062 pos += n_elems;
1063 }
1064
1065 return pos;
1066}
1067
1068int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
1069 int length2, uint16_t *a_out) {
1070 int out_card = 0;
1071 int k1 = 0, k2 = 0;
1072 if (length1 == 0) return 0;
1073 if (length2 == 0) {
1074 if (a1 != a_out) memcpy(a_out, a1, sizeof(uint16_t) * length1);
1075 return length1;
1076 }
1077 uint16_t s1 = a1[k1];
1078 uint16_t s2 = a2[k2];
1079 while (true) {
1080 if (s1 < s2) {
1081 a_out[out_card++] = s1;
1082 ++k1;
1083 if (k1 >= length1) {
1084 break;
1085 }
1086 s1 = a1[k1];
1087 } else if (s1 == s2) {
1088 ++k1;
1089 ++k2;
1090 if (k1 >= length1) {
1091 break;
1092 }
1093 if (k2 >= length2) {
1094 memmove(a_out + out_card, a1 + k1,
1095 sizeof(uint16_t) * (length1 - k1));
1096 return out_card + length1 - k1;
1097 }
1098 s1 = a1[k1];
1099 s2 = a2[k2];
1100 } else { // if (val1>val2)
1101 ++k2;
1102 if (k2 >= length2) {
1103 memmove(a_out + out_card, a1 + k1,
1104 sizeof(uint16_t) * (length1 - k1));
1105 return out_card + length1 - k1;
1106 }
1107 s2 = a2[k2];
1108 }
1109 }
1110 return out_card;
1111}
1112
1113int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
1114 const uint16_t *array_2, int32_t card_2, uint16_t *out) {
1115 int32_t pos1 = 0, pos2 = 0, pos_out = 0;
1116 while (pos1 < card_1 && pos2 < card_2) {
1117 const uint16_t v1 = array_1[pos1];
1118 const uint16_t v2 = array_2[pos2];
1119 if (v1 == v2) {
1120 ++pos1;
1121 ++pos2;
1122 continue;
1123 }
1124 if (v1 < v2) {
1125 out[pos_out++] = v1;
1126 ++pos1;
1127 } else {
1128 out[pos_out++] = v2;
1129 ++pos2;
1130 }
1131 }
1132 if (pos1 < card_1) {
1133 const size_t n_elems = card_1 - pos1;
1134 memcpy(out + pos_out, array_1 + pos1, n_elems * sizeof(uint16_t));
1135 pos_out += (int32_t)n_elems;
1136 } else if (pos2 < card_2) {
1137 const size_t n_elems = card_2 - pos2;
1138 memcpy(out + pos_out, array_2 + pos2, n_elems * sizeof(uint16_t));
1139 pos_out += (int32_t)n_elems;
1140 }
1141 return pos_out;
1142}
1143
1144#ifdef USESSE4
1145
1146/***
1147 * start of the SIMD 16-bit union code
1148 *
1149 */
1150
1151// Assuming that vInput1 and vInput2 are sorted, produces a sorted output going
1152// from vecMin all the way to vecMax
1153// developed originally for merge sort using SIMD instructions.
1154// Standard merge. See, e.g., Inoue and Taura, SIMD- and Cache-Friendly
1155// Algorithm for Sorting an Array of Structures
1156static inline void sse_merge(const __m128i *vInput1,
1157 const __m128i *vInput2, // input 1 & 2
1158 __m128i *vecMin, __m128i *vecMax) { // output
1159 __m128i vecTmp;
1160 vecTmp = _mm_min_epu16(*vInput1, *vInput2);
1161 *vecMax = _mm_max_epu16(*vInput1, *vInput2);
1162 vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 2);
1163 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1164 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1165 vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1166 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1167 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1168 vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1169 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1170 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1171 vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1172 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1173 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1174 vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1175 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1176 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1177 vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1178 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1179 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1180 vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1181 *vecMin = _mm_min_epu16(vecTmp, *vecMax);
1182 *vecMax = _mm_max_epu16(vecTmp, *vecMax);
1183 *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 2);
1184}
1185
1186// used by store_unique, generated by simdunion.py
1187static uint8_t uniqshuf[] = {
1188 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb,
1189 0xc, 0xd, 0xe, 0xf, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
1190 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1191 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1192 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
1193 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9,
1194 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
1195 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1196 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
1197 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd,
1198 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1199 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1200 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
1201 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb,
1202 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9,
1203 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1204 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
1205 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd,
1206 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9,
1207 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1208 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1209 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1210 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1211 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1212 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
1213 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd,
1214 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1215 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1216 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1217 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd,
1218 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xa, 0xb,
1219 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1220 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
1221 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd,
1222 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1223 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1224 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1225 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0xa, 0xb, 0xc, 0xd,
1226 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xa, 0xb,
1227 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1228 0x0, 0x1, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1229 0xFF, 0xFF, 0xFF, 0xFF, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1230 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1231 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1232 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf,
1233 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
1234 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7,
1235 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1236 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf,
1237 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd,
1238 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7,
1239 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1240 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1241 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9,
1242 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1243 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1244 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1245 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf,
1246 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1247 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1248 0x2, 0x3, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1249 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, 0xc, 0xd, 0xe, 0xf,
1250 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xc, 0xd,
1251 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1252 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf,
1253 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd,
1254 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1255 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1256 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1257 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xc, 0xd,
1258 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
1259 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1260 0x0, 0x1, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1261 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1262 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1263 0x4, 0x5, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1264 0x2, 0x3, 0x4, 0x5, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1265 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0xc, 0xd, 0xe, 0xf,
1266 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xc, 0xd,
1267 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1268 0x0, 0x1, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1269 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF,
1270 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xc, 0xd,
1271 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1272 0xc, 0xd, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1273 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1274 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1275 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1276 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf,
1277 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb,
1278 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1279 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1280 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF,
1281 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb,
1282 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9,
1283 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1284 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf,
1285 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb,
1286 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1287 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1288 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1289 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb,
1290 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9,
1291 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1292 0x0, 0x1, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1293 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF,
1294 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1295 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1296 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF,
1297 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb,
1298 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7,
1299 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1300 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF,
1301 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xe, 0xf,
1302 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7,
1303 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1304 0x6, 0x7, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1305 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb,
1306 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1307 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1308 0x0, 0x1, 0x4, 0x5, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1309 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF,
1310 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1311 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1312 0x2, 0x3, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1313 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xa, 0xb, 0xe, 0xf, 0xFF, 0xFF,
1314 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xa, 0xb, 0xe, 0xf,
1315 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1316 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf,
1317 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
1318 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1319 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1320 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1321 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9,
1322 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
1323 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1324 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1325 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF,
1326 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1327 0x4, 0x5, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1328 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1329 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xe, 0xf,
1330 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9,
1331 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1332 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1333 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF,
1334 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9,
1335 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1336 0x8, 0x9, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1337 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1338 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1339 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1340 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1341 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF,
1342 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1343 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1344 0x2, 0x3, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1345 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0xe, 0xf, 0xFF, 0xFF,
1346 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xe, 0xf,
1347 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1348 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF,
1349 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0xe, 0xf, 0xFF, 0xFF,
1350 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1351 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1352 0x4, 0x5, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1353 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0xe, 0xf, 0xFF, 0xFF,
1354 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xe, 0xf,
1355 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1356 0x0, 0x1, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1357 0xFF, 0xFF, 0xFF, 0xFF, 0xe, 0xf, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1358 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1359 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF,
1360 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd,
1361 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
1362 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7,
1363 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1364 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd,
1365 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb,
1366 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7,
1367 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1368 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1369 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9,
1370 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1371 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1372 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF,
1373 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd,
1374 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1375 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1376 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1377 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd,
1378 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xa, 0xb,
1379 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1380 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd,
1381 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb,
1382 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1383 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1384 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1385 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xa, 0xb,
1386 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
1387 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1388 0x0, 0x1, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1389 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF,
1390 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1391 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1392 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1393 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0xa, 0xb, 0xc, 0xd,
1394 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xa, 0xb,
1395 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1396 0x0, 0x1, 0x2, 0x3, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1397 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF,
1398 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xa, 0xb,
1399 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1400 0xa, 0xb, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1401 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1402 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1403 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1404 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF,
1405 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd,
1406 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1407 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1408 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1409 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xc, 0xd,
1410 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9,
1411 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1412 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF,
1413 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xc, 0xd,
1414 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1415 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1416 0x4, 0x5, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1417 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xc, 0xd,
1418 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9,
1419 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1420 0x0, 0x1, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1421 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1422 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1423 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1424 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1425 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xc, 0xd,
1426 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7,
1427 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1428 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1429 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF,
1430 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7,
1431 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1432 0x6, 0x7, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1433 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xc, 0xd,
1434 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1435 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1436 0x0, 0x1, 0x4, 0x5, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1437 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1438 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1439 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1440 0x2, 0x3, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1441 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF,
1442 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xc, 0xd, 0xFF, 0xFF,
1443 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1444 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb,
1445 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
1446 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1447 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1448 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF,
1449 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9,
1450 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
1451 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1452 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF,
1453 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF,
1454 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1455 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1456 0x2, 0x3, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF,
1457 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xa, 0xb,
1458 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9,
1459 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1460 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF,
1461 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF,
1462 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9,
1463 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1464 0x8, 0x9, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1465 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1466 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1467 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1468 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF,
1469 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF,
1470 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1471 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1472 0x2, 0x3, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1473 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7, 0xa, 0xb, 0xFF, 0xFF,
1474 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xa, 0xb,
1475 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1476 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF,
1477 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0xa, 0xb, 0xFF, 0xFF,
1478 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1479 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1480 0x4, 0x5, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1481 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0xa, 0xb, 0xFF, 0xFF,
1482 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xa, 0xb,
1483 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1484 0x0, 0x1, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1485 0xFF, 0xFF, 0xFF, 0xFF, 0xa, 0xb, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1486 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1487 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1488 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1489 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
1490 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x6, 0x7,
1491 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1492 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1493 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF,
1494 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x6, 0x7,
1495 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1496 0x6, 0x7, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1497 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x8, 0x9,
1498 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5,
1499 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1500 0x0, 0x1, 0x4, 0x5, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1501 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1502 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1503 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1504 0x2, 0x3, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1505 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1506 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8, 0x9, 0xFF, 0xFF,
1507 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1508 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF,
1509 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xFF, 0xFF,
1510 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5,
1511 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1512 0x4, 0x5, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1513 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3, 0x6, 0x7, 0xFF, 0xFF,
1514 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0x6, 0x7,
1515 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1516 0x0, 0x1, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1517 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0x7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1518 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x2, 0x3,
1519 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1520 0x2, 0x3, 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1521 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF,
1522 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4, 0x5, 0xFF, 0xFF,
1523 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1524 0x0, 0x1, 0x2, 0x3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1525 0xFF, 0xFF, 0xFF, 0xFF, 0x2, 0x3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1526 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x1, 0xFF, 0xFF,
1527 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1528 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1529 0xFF, 0xFF, 0xFF, 0xFF};
1530
1531// write vector new, while omitting repeated values assuming that previously
1532// written vector was "old"
1533static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) {
1534 __m128i vecTmp = _mm_alignr_epi8(newval, old, 16 - 2);
1535 // lots of high latency instructions follow (optimize?)
1536 int M = _mm_movemask_epi8(
1537 _mm_packs_epi16(_mm_cmpeq_epi16(vecTmp, newval), _mm_setzero_si128()));
1538 int numberofnewvalues = 8 - _mm_popcnt_u32(M);
1539 __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
1540 __m128i val = _mm_shuffle_epi8(newval, key);
1541 _mm_storeu_si128((__m128i *)output, val);
1542 return numberofnewvalues;
1543}
1544
1545// working in-place, this function overwrites the repeated values
1546// could be avoided?
1547static inline uint32_t unique(uint16_t *out, uint32_t len) {
1548 uint32_t pos = 1;
1549 for (uint32_t i = 1; i < len; ++i) {
1550 if (out[i] != out[i - 1]) {
1551 out[pos++] = out[i];
1552 }
1553 }
1554 return pos;
1555}
1556
1557// use with qsort, could be avoided
1558static int uint16_compare(const void *a, const void *b) {
1559 return (*(uint16_t *)a - *(uint16_t *)b);
1560}
1561
1562// a one-pass SSE union algorithm
1563// This function may not be safe if array1 == output or array2 == output.
1564uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
1565 const uint16_t *__restrict__ array2, uint32_t length2,
1566 uint16_t *__restrict__ output) {
1567 if ((length1 < 8) || (length2 < 8)) {
1568 return (uint32_t)union_uint16(array1, length1, array2, length2, output);
1569 }
1570 __m128i vA, vB, V, vecMin, vecMax;
1571 __m128i laststore;
1572 uint16_t *initoutput = output;
1573 uint32_t len1 = length1 / 8;
1574 uint32_t len2 = length2 / 8;
1575 uint32_t pos1 = 0;
1576 uint32_t pos2 = 0;
1577 // we start the machine
1578 vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
1579 pos1++;
1580 vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
1581 pos2++;
1582 sse_merge(&vA, &vB, &vecMin, &vecMax);
1583 laststore = _mm_set1_epi16(-1);
1584 output += store_unique(laststore, vecMin, output);
1585 laststore = vecMin;
1586 if ((pos1 < len1) && (pos2 < len2)) {
1587 uint16_t curA, curB;
1588 curA = array1[8 * pos1];
1589 curB = array2[8 * pos2];
1590 while (true) {
1591 if (curA <= curB) {
1592 V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
1593 pos1++;
1594 if (pos1 < len1) {
1595 curA = array1[8 * pos1];
1596 } else {
1597 break;
1598 }
1599 } else {
1600 V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
1601 pos2++;
1602 if (pos2 < len2) {
1603 curB = array2[8 * pos2];
1604 } else {
1605 break;
1606 }
1607 }
1608 sse_merge(&V, &vecMax, &vecMin, &vecMax);
1609 output += store_unique(laststore, vecMin, output);
1610 laststore = vecMin;
1611 }
1612 sse_merge(&V, &vecMax, &vecMin, &vecMax);
1613 output += store_unique(laststore, vecMin, output);
1614 laststore = vecMin;
1615 }
1616 // we finish the rest off using a scalar algorithm
1617 // could be improved?
1618 //
1619 // copy the small end on a tmp buffer
1620 uint32_t len = (uint32_t)(output - initoutput);
1621 uint16_t buffer[16];
1622 uint32_t leftoversize = store_unique(laststore, vecMax, buffer);
1623 if (pos1 == len1) {
1624 memcpy(buffer + leftoversize, array1 + 8 * pos1,
1625 (length1 - 8 * len1) * sizeof(uint16_t));
1626 leftoversize += length1 - 8 * len1;
1627 qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
1628
1629 leftoversize = unique(buffer, leftoversize);
1630 len += (uint32_t)union_uint16(buffer, leftoversize, array2 + 8 * pos2,
1631 length2 - 8 * pos2, output);
1632 } else {
1633 memcpy(buffer + leftoversize, array2 + 8 * pos2,
1634 (length2 - 8 * len2) * sizeof(uint16_t));
1635 leftoversize += length2 - 8 * len2;
1636 qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
1637 leftoversize = unique(buffer, leftoversize);
1638 len += (uint32_t)union_uint16(buffer, leftoversize, array1 + 8 * pos1,
1639 length1 - 8 * pos1, output);
1640 }
1641 return len;
1642}
1643
1644/**
1645 * End of the SIMD 16-bit union code
1646 *
1647 */
1648
1649/**
1650 * Start of SIMD 16-bit XOR code
1651 */
1652
1653// write vector new, while omitting repeated values assuming that previously
1654// written vector was "old"
1655static inline int store_unique_xor(__m128i old, __m128i newval,
1656 uint16_t *output) {
1657 __m128i vecTmp1 = _mm_alignr_epi8(newval, old, 16 - 4);
1658 __m128i vecTmp2 = _mm_alignr_epi8(newval, old, 16 - 2);
1659 __m128i equalleft = _mm_cmpeq_epi16(vecTmp2, vecTmp1);
1660 __m128i equalright = _mm_cmpeq_epi16(vecTmp2, newval);
1661 __m128i equalleftoright = _mm_or_si128(equalleft, equalright);
1662 int M = _mm_movemask_epi8(
1663 _mm_packs_epi16(equalleftoright, _mm_setzero_si128()));
1664 int numberofnewvalues = 8 - _mm_popcnt_u32(M);
1665 __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
1666 __m128i val = _mm_shuffle_epi8(vecTmp2, key);
1667 _mm_storeu_si128((__m128i *)output, val);
1668 return numberofnewvalues;
1669}
1670
1671// working in-place, this function overwrites the repeated values
1672// could be avoided? Warning: assumes len > 0
1673static inline uint32_t unique_xor(uint16_t *out, uint32_t len) {
1674 uint32_t pos = 1;
1675 for (uint32_t i = 1; i < len; ++i) {
1676 if (out[i] != out[i - 1]) {
1677 out[pos++] = out[i];
1678 } else
1679 pos--; // if it is identical to previous, delete it
1680 }
1681 return pos;
1682}
1683
1684// a one-pass SSE xor algorithm
1685uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
1686 const uint16_t *__restrict__ array2, uint32_t length2,
1687 uint16_t *__restrict__ output) {
1688 if ((length1 < 8) || (length2 < 8)) {
1689 return xor_uint16(array1, length1, array2, length2, output);
1690 }
1691 __m128i vA, vB, V, vecMin, vecMax;
1692 __m128i laststore;
1693 uint16_t *initoutput = output;
1694 uint32_t len1 = length1 / 8;
1695 uint32_t len2 = length2 / 8;
1696 uint32_t pos1 = 0;
1697 uint32_t pos2 = 0;
1698 // we start the machine
1699 vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
1700 pos1++;
1701 vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
1702 pos2++;
1703 sse_merge(&vA, &vB, &vecMin, &vecMax);
1704 laststore = _mm_set1_epi16(-1);
1705 uint16_t buffer[17];
1706 output += store_unique_xor(laststore, vecMin, output);
1707
1708 laststore = vecMin;
1709 if ((pos1 < len1) && (pos2 < len2)) {
1710 uint16_t curA, curB;
1711 curA = array1[8 * pos1];
1712 curB = array2[8 * pos2];
1713 while (true) {
1714 if (curA <= curB) {
1715 V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
1716 pos1++;
1717 if (pos1 < len1) {
1718 curA = array1[8 * pos1];
1719 } else {
1720 break;
1721 }
1722 } else {
1723 V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
1724 pos2++;
1725 if (pos2 < len2) {
1726 curB = array2[8 * pos2];
1727 } else {
1728 break;
1729 }
1730 }
1731 sse_merge(&V, &vecMax, &vecMin, &vecMax);
1732 // conditionally stores the last value of laststore as well as all
1733 // but the
1734 // last value of vecMin
1735 output += store_unique_xor(laststore, vecMin, output);
1736 laststore = vecMin;
1737 }
1738 sse_merge(&V, &vecMax, &vecMin, &vecMax);
1739 // conditionally stores the last value of laststore as well as all but
1740 // the
1741 // last value of vecMin
1742 output += store_unique_xor(laststore, vecMin, output);
1743 laststore = vecMin;
1744 }
1745 uint32_t len = (uint32_t)(output - initoutput);
1746
1747 // we finish the rest off using a scalar algorithm
1748 // could be improved?
1749 // conditionally stores the last value of laststore as well as all but the
1750 // last value of vecMax,
1751 // we store to "buffer"
1752 int leftoversize = store_unique_xor(laststore, vecMax, buffer);
1753 uint16_t vec7 = _mm_extract_epi16(vecMax, 7);
1754 uint16_t vec6 = _mm_extract_epi16(vecMax, 6);
1755 if (vec7 != vec6) buffer[leftoversize++] = vec7;
1756 if (pos1 == len1) {
1757 memcpy(buffer + leftoversize, array1 + 8 * pos1,
1758 (length1 - 8 * len1) * sizeof(uint16_t));
1759 leftoversize += length1 - 8 * len1;
1760 if (leftoversize == 0) { // trivial case
1761 memcpy(output, array2 + 8 * pos2,
1762 (length2 - 8 * pos2) * sizeof(uint16_t));
1763 len += (length2 - 8 * pos2);
1764 } else {
1765 qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
1766 leftoversize = unique_xor(buffer, leftoversize);
1767 len += xor_uint16(buffer, leftoversize, array2 + 8 * pos2,
1768 length2 - 8 * pos2, output);
1769 }
1770 } else {
1771 memcpy(buffer + leftoversize, array2 + 8 * pos2,
1772 (length2 - 8 * len2) * sizeof(uint16_t));
1773 leftoversize += length2 - 8 * len2;
1774 if (leftoversize == 0) { // trivial case
1775 memcpy(output, array1 + 8 * pos1,
1776 (length1 - 8 * pos1) * sizeof(uint16_t));
1777 len += (length1 - 8 * pos1);
1778 } else {
1779 qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
1780 leftoversize = unique_xor(buffer, leftoversize);
1781 len += xor_uint16(buffer, leftoversize, array1 + 8 * pos1,
1782 length1 - 8 * pos1, output);
1783 }
1784 }
1785 return len;
1786}
1787
1788/**
1789 * End of SIMD 16-bit XOR code
1790 */
1791
1792#endif // USESSE4
1793
1794size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
1795 size_t size_2, uint32_t *buffer) {
1796 size_t pos = 0, idx_1 = 0, idx_2 = 0;
1797
1798 if (0 == size_2) {
1799 memmove(buffer, set_1, size_1 * sizeof(uint32_t));
1800 return size_1;
1801 }
1802 if (0 == size_1) {
1803 memmove(buffer, set_2, size_2 * sizeof(uint32_t));
1804 return size_2;
1805 }
1806
1807 uint32_t val_1 = set_1[idx_1], val_2 = set_2[idx_2];
1808
1809 while (true) {
1810 if (val_1 < val_2) {
1811 buffer[pos++] = val_1;
1812 ++idx_1;
1813 if (idx_1 >= size_1) break;
1814 val_1 = set_1[idx_1];
1815 } else if (val_2 < val_1) {
1816 buffer[pos++] = val_2;
1817 ++idx_2;
1818 if (idx_2 >= size_2) break;
1819 val_2 = set_2[idx_2];
1820 } else {
1821 buffer[pos++] = val_1;
1822 ++idx_1;
1823 ++idx_2;
1824 if (idx_1 >= size_1 || idx_2 >= size_2) break;
1825 val_1 = set_1[idx_1];
1826 val_2 = set_2[idx_2];
1827 }
1828 }
1829
1830 if (idx_1 < size_1) {
1831 const size_t n_elems = size_1 - idx_1;
1832 memmove(buffer + pos, set_1 + idx_1, n_elems * sizeof(uint32_t));
1833 pos += n_elems;
1834 } else if (idx_2 < size_2) {
1835 const size_t n_elems = size_2 - idx_2;
1836 memmove(buffer + pos, set_2 + idx_2, n_elems * sizeof(uint32_t));
1837 pos += n_elems;
1838 }
1839
1840 return pos;
1841}
1842
1843size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
1844 const uint32_t *set_2, size_t size_2) {
1845 size_t pos = 0, idx_1 = 0, idx_2 = 0;
1846
1847 if (0 == size_2) {
1848 return size_1;
1849 }
1850 if (0 == size_1) {
1851 return size_2;
1852 }
1853
1854 uint32_t val_1 = set_1[idx_1], val_2 = set_2[idx_2];
1855
1856 while (true) {
1857 if (val_1 < val_2) {
1858 ++idx_1;
1859 ++pos;
1860 if (idx_1 >= size_1) break;
1861 val_1 = set_1[idx_1];
1862 } else if (val_2 < val_1) {
1863 ++idx_2;
1864 ++pos;
1865 if (idx_2 >= size_2) break;
1866 val_2 = set_2[idx_2];
1867 } else {
1868 ++idx_1;
1869 ++idx_2;
1870 ++pos;
1871 if (idx_1 >= size_1 || idx_2 >= size_2) break;
1872 val_1 = set_1[idx_1];
1873 val_2 = set_2[idx_2];
1874 }
1875 }
1876
1877 if (idx_1 < size_1) {
1878 const size_t n_elems = size_1 - idx_1;
1879 pos += n_elems;
1880 } else if (idx_2 < size_2) {
1881 const size_t n_elems = size_2 - idx_2;
1882 pos += n_elems;
1883 }
1884 return pos;
1885}
1886
1887
1888
1889size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
1890 size_t size_2, uint16_t *buffer) {
1891#ifdef ROARING_VECTOR_OPERATIONS_ENABLED
1892 // compute union with smallest array first
1893 if (size_1 < size_2) {
1894 return union_vector16(set_1, (uint32_t)size_1,
1895 set_2, (uint32_t)size_2, buffer);
1896 } else {
1897 return union_vector16(set_2, (uint32_t)size_2,
1898 set_1, (uint32_t)size_1, buffer);
1899 }
1900#else
1901 // compute union with smallest array first
1902 if (size_1 < size_2) {
1903 return union_uint16(
1904 set_1, size_1, set_2, size_2, buffer);
1905 } else {
1906 return union_uint16(
1907 set_2, size_2, set_1, size_1, buffer);
1908 }
1909#endif
1910}
1911
1912bool memequals(const void *s1, const void *s2, size_t n) {
1913 if (n == 0) {
1914 return true;
1915 }
1916#ifdef USEAVX
1917 const uint8_t *ptr1 = (const uint8_t *)s1;
1918 const uint8_t *ptr2 = (const uint8_t *)s2;
1919 const uint8_t *end1 = ptr1 + n;
1920 const uint8_t *end8 = ptr1 + n/8*8;
1921 const uint8_t *end32 = ptr1 + n/32*32;
1922
1923 while (ptr1 < end32) {
1924 __m256i r1 = _mm256_loadu_si256((const __m256i*)ptr1);
1925 __m256i r2 = _mm256_loadu_si256((const __m256i*)ptr2);
1926 int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
1927 if ((uint32_t)mask != UINT32_MAX) {
1928 return false;
1929 }
1930 ptr1 += 32;
1931 ptr2 += 32;
1932 }
1933
1934 while (ptr1 < end8) {
1935 uint64_t v1 = *((const uint64_t*)ptr1);
1936 uint64_t v2 = *((const uint64_t*)ptr2);
1937 if (v1 != v2) {
1938 return false;
1939 }
1940 ptr1 += 8;
1941 ptr2 += 8;
1942 }
1943
1944 while (ptr1 < end1) {
1945 if (*ptr1 != *ptr2) {
1946 return false;
1947 }
1948 ptr1++;
1949 ptr2++;
1950 }
1951
1952 return true;
1953#else
1954 return memcmp(s1, s2, n) == 0;
1955#endif
1956}
1957